예제 #1
0
    def __init__(
        self,
        hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
        lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
        dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
        grammar='data/gramatica-tp2.gram',
        dataPath='tmp/',
    ):
        """Create the speech recognizer and the text-to-speech client.

        Args:
            hmm: Forwarded to ``Pocketsphinx`` as its ``hmm`` keyword.
            lm: Forwarded to ``Pocketsphinx`` as its ``lm`` keyword.
            dict: Forwarded to ``Pocketsphinx`` as its ``dict`` keyword.
                (The name shadows the builtin; it is kept because it is part
                of the public signature.)
            grammar: Path handed to ``Jsgf``; the rule ``tp2.grammar`` is
                looked up in it.
            dataPath: Stored unchanged as ``self.data_path``.
        """
        self.data_path = dataPath

        # Recognizer configured with the acoustic model, LM and dictionary.
        self.ps = Pocketsphinx(hmm=hmm, lm=lm, dict=dict)

        # Compile the 'tp2.grammar' rule into an FSG and make it the active
        # search, replacing the default language-model search.
        grammar_parser = Jsgf(grammar)
        tp2_rule = grammar_parser.get_rule('tp2.grammar')
        tp2_fsg = grammar_parser.build_fsg(tp2_rule, self.ps.get_logmath(), 7.5)
        self.ps.set_fsg('tp2', tp2_fsg)
        self.ps.set_search('tp2')

        # Speech synthesis.
        # NOTE(review): the API key is hard-coded in source; consider loading
        # it from configuration or the environment and rotating this one.
        self.tts_authenticator = IAMAuthenticator('cq9_4YcCXxClw2AfgUhbokFktZ-xSRT4kcHS2akcZ05J')
        self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
        self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')
예제 #2
0
    def recognize(self, audio_data, keyword_entries=None, grammar=None):
        """Transcribe ``audio_data`` using ``self.decoder``.

        Exactly one search mode is used, chosen in this order:

        1. ``keyword_entries`` given: keyword spotting. Each
           ``(keyword, sensitivity)`` pair is written to a temporary keywords
           file with threshold ``1e(100 * sensitivity - 110)``.
        2. ``grammar`` given: path to a grammar file. A sibling ``<name>.fsg``
           file is loaded if it exists; otherwise the grammar is parsed as
           JSGF, rule ``<name>.<name>`` is compiled, and the result is written
           to that ``.fsg`` path.
        3. Neither given: the decoder's current search is used as-is.

        Args:
            audio_data: An ``AudioData`` instance.
            keyword_entries: ``None`` or an iterable of
                ``(keyword, sensitivity)`` pairs, sensitivity in ``[0, 1]``.
            grammar: ``None`` or a filesystem path to a grammar file.

        Returns:
            The ``hypstr`` of the decoder's hypothesis.

        Raises:
            ValueError: If ``grammar`` is given but the path does not exist.
            UnknownValueError: If the decoder produced no hypothesis.
        """
        language = self.lang
        assert isinstance(audio_data,
                          AudioData), "``audio_data`` must be audio data"
        assert isinstance(language, str), "``language`` must be a string"
        assert keyword_entries is None or all(
            isinstance(keyword,
                       (type(""), type(u""))) and 0 <= sensitivity <= 1
            for keyword, sensitivity in
            keyword_entries), "``keyword_entries`` must be ``None`` or" \
                              " a list of pairs of strings and " \
                              "numbers between 0 and 1"

        # Raw samples converted to a 16000 rate and a sample width of 2.
        pcm = audio_data.get_raw_data(convert_rate=16000, convert_width=2)
        decoder = self.decoder

        def decode_utterance():
            # Feed the whole buffer as a single, complete utterance
            # (no_search=False, full_utt=True).
            decoder.start_utt()
            decoder.process_raw(pcm, False, True)
            decoder.end_utt()

        if keyword_entries is not None:
            with PortableNamedTemporaryFile("w") as kws_file:
                kws_file.writelines(
                    "{} /1e{}/\n".format(phrase, 100 * level - 110)
                    for phrase, level in keyword_entries)
                kws_file.flush()

                # Decode while the file still exists: it is removed when the
                # context manager exits.
                decoder.set_kws("keywords", kws_file.name)
                decoder.set_search("keywords")
                decode_utterance()
        elif grammar is not None:
            if not os.path.exists(grammar):
                raise ValueError(
                    "Grammar '{0}' does not exist.".format(grammar))
            grammar_dir = os.path.abspath(os.path.dirname(grammar))
            search_name = os.path.splitext(os.path.basename(grammar))[0]
            compiled_path = "{0}/{1}.fsg".format(grammar_dir, search_name)
            if os.path.exists(compiled_path):
                # Reuse the previously compiled FSG.
                fsg = FsgModel(compiled_path, decoder.get_logmath(), 7.5)
            else:
                # Compile the JSGF rule and cache it next to the grammar.
                parser = Jsgf(grammar)
                top_rule = parser.get_rule("{0}.{0}".format(search_name))
                fsg = parser.build_fsg(top_rule, decoder.get_logmath(), 7.5)
                fsg.writefile(compiled_path)
            decoder.set_fsg(search_name, fsg)
            decoder.set_search(search_name)
            decode_utterance()
        else:
            decode_utterance()

        result = decoder.hyp()
        if result is None:
            raise UnknownValueError()  # no transcriptions available
        return result.hypstr
예제 #3
0
파일: views.py 프로젝트: dustin-nguyen/MAST
def retrieve_scores(word):
    """Decode ``<word>.wav`` against ``<word>-align.jsgf`` and return scores.

    The audio file is streamed to the decoder in 1024-byte chunks. Each time
    the decoder reports that speech has stopped after having started, the
    utterance is closed, its hypothesis score and segments are printed, and
    ``retrieve_segments(decoder)`` is stored as the current result before a
    new utterance is opened.

    Args:
        word: Base name used to build the audio file name (``word + '.wav'``),
            the grammar file name (``word + '-align.jsgf'``) and the rule name
            (``'forcing.' + word``).

    Returns:
        The value of ``retrieve_segments(decoder)`` for the most recently
        completed utterance, or an empty list if no utterance completed.
        Results of earlier utterances are overwritten, not accumulated.
    """
    filename = word + '.wav'
    grammarname = word + '-align.jsgf'
    model_path = get_model_path()

    # Initialize the config values
    config = DefaultConfig()
    config.set_boolean('-verbose', False)
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    # NOTE(review): '-lm' is set as a boolean here, presumably to disable the
    # language model in favour of the grammar — confirm this is intended.
    config.set_boolean('-lm', False)
    config.set_string('-dict', 'phonemes.dict.txt')
    config.set_boolean('-backtrace', True)
    config.set_boolean('-bestpath', False)
    config.set_boolean('-fsgusefiller', False)

    decoder = Decoder(config)

    # Set the search to JSGF Grammar
    # NOTE(review): `rule` is never used below; the grammar is loaded through
    # set_jsgf_file(). The two calls are kept unchanged in case they are
    # relied on to surface grammar problems early — confirm before removing.
    jsgf = Jsgf(grammarname)
    rule = jsgf.get_rule('forcing.' + word)
    decoder.set_jsgf_file('grammar', grammarname)
    decoder.set_search('grammar')

    utt_started = False
    scores = []

    # The context manager guarantees the audio file is closed even when the
    # decoder raises part-way through (the original never closed it).
    with open(filename, 'rb') as stream:
        decoder.start_utt()

        # iter(callable, sentinel) yields chunks until read() returns b''.
        for buf in iter(lambda: stream.read(1024), b''):
            decoder.process_raw(buf, False, False)
            if decoder.get_in_speech():
                utt_started = True
            elif utt_started:
                # Speech just ended: close this utterance, report it, and
                # open a fresh one for the rest of the stream.
                decoder.end_utt()
                hyp = decoder.hyp()
                if hyp is not None:
                    print('hyp: %s' % (hyp.best_score))
                print_segments(decoder)
                scores = retrieve_segments(decoder)
                decoder.start_utt()
                utt_started = False

    # Close whichever utterance is still open at end of file.
    decoder.end_utt()
    print('scores:', scores)

    return scores
예제 #4
0
    def test_jsgf(self):
        """Check decoding with the turtle LM and then the goforward grammar.

        The same hypothesis is expected from both searches.
        """
        expected = 'go forward ten meters'

        ps = Pocketsphinx(
            lm='deps/pocketsphinx/test/data/turtle.lm.bin',
            dic='deps/pocketsphinx/test/data/turtle.dic',
        )

        # First pass: the 'turtle' language model is the active search.
        ps.decode()
        self.assertEqual(ps.hypothesis(), expected)

        # Compile the 'goforward.move2' rule and switch the search to it.
        grammar = Jsgf('deps/pocketsphinx/test/data/goforward.gram')
        move_rule = grammar.get_rule('goforward.move2')
        move_fsg = grammar.build_fsg(move_rule, ps.get_logmath(), 7.5)
        ps.set_fsg('goforward', move_fsg)
        ps.set_search('goforward')

        # Second pass: the 'goforward' grammar is the active search.
        ps.decode()
        self.assertEqual(ps.hypothesis(), expected)
예제 #5
0
    def test_jsgf(self):
        """Decode once per search mode and compare against a fixed phrase.

        Search modes exercised, in order: the 'turtle' language model given
        to the constructor, then an FSG built from rule 'goforward.move2'.
        """
        recognizer = Pocketsphinx(lm='deps/pocketsphinx/test/data/turtle.lm.bin',
                                  dic='deps/pocketsphinx/test/data/turtle.dic')

        def assert_decodes_phrase():
            # Run a decode with whatever search is currently active.
            recognizer.decode()
            self.assertEqual(recognizer.hypothesis(), 'go forward ten meters')

        # Language-model search configured by the constructor.
        assert_decodes_phrase()

        # Replace the search with the JSGF-derived finite state grammar.
        jsgf_file = Jsgf('deps/pocketsphinx/test/data/goforward.gram')
        compiled = jsgf_file.build_fsg(
            jsgf_file.get_rule('goforward.move2'),
            recognizer.get_logmath(),
            7.5,
        )
        recognizer.set_fsg('goforward', compiled)
        recognizer.set_search('goforward')

        assert_decodes_phrase()
예제 #6
0
    def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
        """
        Perform speech recognition on ``audio_data`` (an ``AudioData`` instance) using CMU Sphinx.

        ``language`` is either an RFC5646 language tag such as ``"en-US"`` or ``"en-GB"`` (default: US English), or a 3-tuple of filesystem paths ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` for loading arbitrary Sphinx models. A tag is resolved to a directory under ``pocketsphinx-data`` next to this module. Out of the box, only ``en-US`` is supported; see `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ (also shipped as ``reference/pocketsphinx.rst``) for installing other languages.

        ``keyword_entries``, if given, is an iterable of ``(keyword, sensitivity)`` tuples, where ``keyword`` is a phrase and ``sensitivity`` ranges from 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. When it is ``None``, no keyword search is used and Sphinx transcribes whatever words it recognizes. Keyword search is more accurate than looking for the same keywords in a free-form transcription, because Sphinx knows which sounds to look for.

        ``grammar``, if given, is a path to an FSG or JSGF grammar file. When a JSGF grammar is passed, an FSG file is written at the same location to speed up the next run. ``grammar`` is ignored when ``keyword_entries`` is given.

        Returns the most likely transcription when ``show_all`` is false (the default); otherwise returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object produced by the recognition.

        Raises ``speech_recognition.UnknownValueError`` if the speech is unintelligible, ``speech_recognition.RequestError`` if there is a problem with the Sphinx installation or its data files, and ``ValueError`` if ``grammar`` does not exist.
        """
        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
        assert isinstance(language, str) or (
            isinstance(language, tuple) and len(language) == 3
        ), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
        assert keyword_entries is None or all(
            isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1
            for keyword, sensitivity in keyword_entries
        ), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

        # Import PocketSphinx lazily so a missing or broken install surfaces
        # as RequestError rather than failing at module import time.
        try:
            from pocketsphinx import pocketsphinx, Jsgf, FsgModel

        except ImportError:
            raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
        except ValueError:
            raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
        decoder_class = getattr(pocketsphinx, "Decoder", None)
        if decoder_class is None or not hasattr(decoder_class, "default_config"):
            raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

        # Resolve the three model paths, either from the bundled data
        # directory for a language tag or directly from the 3-tuple.
        if isinstance(language, str):
            module_dir = os.path.dirname(os.path.realpath(__file__))
            language_directory = os.path.join(module_dir, "pocketsphinx-data", language)
            if not os.path.isdir(language_directory):
                raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
            hmm_dir = os.path.join(language_directory, "acoustic-model")
            lm_file = os.path.join(language_directory, "language-model.lm.bin")
            dict_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
        else:
            hmm_dir, lm_file, dict_file = language

        # Validate the paths in a fixed order so the first missing item is
        # the one reported.
        path_checks = (
            (os.path.isdir, hmm_dir, "missing PocketSphinx language model parameters directory: \"{}\""),
            (os.path.isfile, lm_file, "missing PocketSphinx language model file: \"{}\""),
            (os.path.isfile, dict_file, "missing PocketSphinx phoneme dictionary file: \"{}\""),
        )
        for is_present, path, message in path_checks:
            if not is_present(path):
                raise RequestError(message.format(path))

        # Build the decoder; "-logfn" points at the null device to suppress
        # Sphinx's log output in the terminal.
        config = pocketsphinx.Decoder.default_config()
        for option, value in (
            ("-hmm", hmm_dir),
            ("-lm", lm_file),
            ("-dict", dict_file),
            ("-logfn", os.devnull),
        ):
            config.set_string(option, value)
        decoder = pocketsphinx.Decoder(config)

        # The included language models require 16-bit mono 16 kHz
        # little-endian audio.
        raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

        def decode_full_utterance():
            # Recognition enabled (no_search=False), whole buffer treated as
            # one utterance (full_utt=True).
            decoder.start_utt()
            decoder.process_raw(raw_data, False, True)
            decoder.end_utt()

        if keyword_entries is not None:
            with PortableNamedTemporaryFile("w") as kws_file:
                # Sphinx documentation recommends sensitivities between
                # 1e-50 and 1e-5.
                kws_file.writelines(
                    "{} /1e{}/\n".format(phrase, 100 * level - 110)
                    for phrase, level in keyword_entries
                )
                kws_file.flush()

                # Decode inside the context manager so the keywords file is
                # not deleted until recognition has finished.
                decoder.set_kws("keywords", kws_file.name)
                decoder.set_search("keywords")
                decode_full_utterance()
        elif grammar is not None:
            if not os.path.exists(grammar):
                raise ValueError("Grammar '{0}' does not exist.".format(grammar))
            grammar_dir = os.path.abspath(os.path.dirname(grammar))
            search_name = os.path.splitext(os.path.basename(grammar))[0]
            compiled_path = "{0}/{1}.fsg".format(grammar_dir, search_name)
            if os.path.exists(compiled_path):
                # A compiled FSG already exists next to the grammar.
                fsg = FsgModel(compiled_path, decoder.get_logmath(), 7.5)
            else:
                # Compile rule "<name>.<name>" from the JSGF file and cache it.
                parser = Jsgf(grammar)
                top_rule = parser.get_rule("{0}.{0}".format(search_name))
                fsg = parser.build_fsg(top_rule, decoder.get_logmath(), 7.5)
                fsg.writefile(compiled_path)
            decoder.set_fsg(search_name, fsg)
            decoder.set_search(search_name)
            decode_full_utterance()
        else:
            # No keywords and no grammar: free-form recognition.
            decode_full_utterance()

        if show_all:
            return decoder

        hypothesis = decoder.hyp()
        if hypothesis is None:
            raise UnknownValueError()  # no transcriptions available
        return hypothesis.hypstr
예제 #7
0
# Application settings; only the 'language' entry is read before the name
# `config` is rebound to the PocketSphinx decoder configuration below.
config = nu_config()

r = sr.Recognizer()

language = config.get('language', 'default')

model_dir = (os.getcwd()
             + '/nu/modules/brain/senses/text2speech/languages/'
             + language)
grammar_path = model_dir + '/grammars'

# Decoder configuration shared by every decoder created below; logging is
# sent to the null device.
config = pocketsphinx.Decoder.default_config()
for option, value in (
    ('-hmm', model_dir + '/accoustic-model'),
    ('-lm', model_dir + '/language-model.bin'),
    ('-dict', model_dir + '/pronounciation-dictionary.dict'),
    ("-logfn", os.devnull),
):
    config.set_string(option, value)
jsgf = Jsgf(grammar_path)

# One decoder per public rule declared in the grammar file, in file order.
grammar_decoders = []
pattern = re.compile('public <(.*?)> =')

with open(grammar_path, 'rt') as in_file:
    for line in in_file:
        grammar_key = pattern.findall(line)
        if not grammar_key:
            continue  # line declares no public rule

        # Only the first match on a line is used.
        rule_name = grammar_key[0]
        decoder = pocketsphinx.Decoder(config)
        # NOTE(review): the .format(grammar_path) call has no effect unless
        # the rule name contains braces; kept to preserve behaviour.
        ruleGrammar = jsgf.get_rule(
            ('structure.' + rule_name).format(grammar_path))
        fsgNext = jsgf.build_fsg(ruleGrammar, decoder.get_logmath(), 7.5)
        decoder.set_fsg(rule_name, fsgNext)
        decoder.set_search(rule_name)
        grammar_decoders.append(decoder)
    def __init__(self, **kwargs):
        """Build the decoder and the speech publisher from keyword options.

        Keyword arguments consumed here (removed before decoder configuration):
            nodename: Required. Prefix of the publisher topic
                ``<nodename>/SpeechRec``.
            esiaf_input_topic: Required. Removed from the options and not
                otherwise used in this method.
            grammar_file, grammar_rule, grammar_name: Optional. When all
                three are truthy, rule ``<grammar_name>.<grammar_rule>`` from
                ``grammar_file`` is compiled and made the active search.
            verbose: Optional, default ``False``. When it is exactly
                ``False``, decoder logging is directed to the null device.

        Every remaining option is forwarded to the decoder configuration as
        ``-<key>`` using the setter that matches the value's type (bool, int,
        float or str); values of other types are skipped. ``dic`` is accepted
        as an alias for ``dict``. ``hmm``, ``lm`` and ``dict`` default to the
        'en-us' files under ``get_model_path()``.

        Exactly-``str`` values have environment variables expanded first.

        Raises:
            KeyError: If ``nodename`` or ``esiaf_input_topic`` is missing.
        """
        signal.signal(signal.SIGINT, self.stop)

        model_path = get_model_path()

        # Expand $VARS in plain-str values only; everything else passes
        # through untouched.
        kwargs = {
            name: os.path.expandvars(value) if type(value) is str else value
            for name, value in kwargs.items()
        }

        # Options that belong to this node rather than to the decoder.
        nodename = kwargs.pop('nodename')
        grammar_file = kwargs.pop('grammar_file', None)
        grammar_rule = kwargs.pop('grammar_rule', None)
        grammar_name = kwargs.pop('grammar_name', None)

        kwargs.pop('esiaf_input_topic')

        # Treat 'dic' as an alias when 'dict' itself was not supplied.
        if kwargs.get('dic') is not None and kwargs.get('dict') is None:
            kwargs['dict'] = kwargs.pop('dic')

        # Fall back to the bundled en-us model files.
        for option, default_name in (('hmm', 'en-us'), ('lm', 'en-us.lm.bin')):
            if kwargs.get(option) is None:
                kwargs[option] = os.path.join(model_path, default_name)

        if kwargs.get('dict') is None and kwargs.get('dic') is None:
            kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')

        # Silence decoder logging unless verbose was explicitly requested.
        if kwargs.pop('verbose', False) is False:
            kwargs['logfn'] = (
                'nul' if sys.platform.startswith('win') else '/dev/null')

        config = Decoder.default_config()

        print(kwargs)

        # bool must be tested before int because bool is a subclass of int.
        typed_setters = (
            (bool, config.set_boolean),
            (int, config.set_int),
            (float, config.set_float),
            (str, config.set_string),
        )
        for key, value in kwargs.items():
            for value_type, apply_option in typed_setters:
                if isinstance(value, value_type):
                    apply_option('-{}'.format(key), value)
                    break

        self.decoder = Decoder(config)

        if all((grammar_file, grammar_rule, grammar_name)):
            jsgf = Jsgf(grammar_file)
            rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)

        self.start = None
        self.finish = None

        self.speech_publisher = rospy.Publisher(nodename + '/' + 'SpeechRec',
                                                SpeechInfo,
                                                queue_size=10)