def recognize(self, audio_data, keyword_entries=None, grammar=None):
    """Run PocketSphinx recognition on ``audio_data`` and return the best transcription.

    ``audio_data`` must be an ``AudioData`` instance; it is converted to
    16-bit mono 16 kHz raw samples before decoding.

    If ``keyword_entries`` is given, it is an iterable of
    ``(keyword, sensitivity)`` pairs with ``0 <= sensitivity <= 1`` and
    decoding is restricted to a keyword search.  Otherwise, if ``grammar``
    is given, it is a path to an FSG or JSGF grammar file (a compiled
    ``.fsg`` file is cached next to a JSGF grammar to speed up later runs).
    With neither, freeform recognition against the decoder's current
    language model is performed.

    Raises ``ValueError`` if ``grammar`` does not exist, and
    ``UnknownValueError`` if no transcription could be produced.
    """
    language = self.lang
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert isinstance(language, str), "``language`` must be a string"
    assert keyword_entries is None or all(
        isinstance(keyword, str) and 0 <= sensitivity <= 1
        for keyword, sensitivity in keyword_entries), \
        "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    # obtain audio data (the models expect 16-bit mono 16 kHz)
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

    def process_utterance():
        # feed the whole clip through the decoder as one utterance
        # (no_search = False, full_utt = True)
        self.decoder.start_utt()
        self.decoder.process_raw(raw_data, False, True)
        self.decoder.end_utt()

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110)
                         for keyword, sensitivity in keyword_entries)
            f.flush()
            # perform the speech recognition with the keywords file; keep this
            # inside the context manager so the file isn't deleted until done
            self.decoder.set_kws("keywords", f.name)
            self.decoder.set_search("keywords")
            process_utterance()
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError(
                "Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)  # cache the compiled grammar for next time
        else:
            fsg = FsgModel(fsg_path, self.decoder.get_logmath(), 7.5)
        self.decoder.set_fsg(grammar_name, fsg)
        self.decoder.set_search(grammar_name)
        process_utterance()
    else:  # no keywords, perform freeform recognition
        process_utterance()

    # return results
    hypothesis = self.decoder.hyp()
    if hypothesis is not None:
        return hypothesis.hypstr
    raise UnknownValueError()  # no transcriptions available
def __init__(self,
             hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
             lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
             dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
             grammar='data/gramatica-tp2.gram',
             dataPath='tmp/'):
    """Set up a Spanish PocketSphinx recognizer and a Watson TTS client.

    ``hmm``, ``lm`` and ``dict`` point at the CIEMPIESS Spanish models;
    ``grammar`` is a JSGF file whose ``tp2.grammar`` rule is compiled to
    an FSG and activated as the search.  ``dataPath`` is kept as the
    working directory for temporary data.
    """
    self.data_path = dataPath

    # Decoder built on the CIEMPIESS Spanish acoustic/language models.
    self.ps = Pocketsphinx(hmm=hmm, lm=lm, dict=dict)

    # Switch recognition to the JSGF grammar's 'tp2.grammar' rule.
    grammar_spec = Jsgf(grammar)
    target_rule = grammar_spec.get_rule('tp2.grammar')
    compiled_fsg = grammar_spec.build_fsg(target_rule, self.ps.get_logmath(), 7.5)
    self.ps.set_fsg('tp2', compiled_fsg)
    self.ps.set_search('tp2')

    # Speech synthesis via IBM Watson.
    # NOTE(review): hard-coded API credential checked into source — this
    # should be moved to an environment variable or secrets store and the
    # key rotated.
    self.tts_authenticator = IAMAuthenticator('cq9_4YcCXxClw2AfgUhbokFktZ-xSRT4kcHS2akcZ05J')
    self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
    self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')
def test_jsgf(self):
    """Decode once with the turtle LM, then again after switching to a JSGF grammar."""
    ps = Pocketsphinx(lm='deps/pocketsphinx/test/data/turtle.lm.bin',
                      dic='deps/pocketsphinx/test/data/turtle.dic')

    # Baseline decode using the 'turtle' n-gram language model.
    ps.decode()
    self.assertEqual(ps.hypothesis(), 'go forward ten meters')

    # Compile the 'move2' rule of the JSGF grammar into an FSG and activate it.
    grammar = Jsgf('deps/pocketsphinx/test/data/goforward.gram')
    fsg = grammar.build_fsg(grammar.get_rule('goforward.move2'), ps.get_logmath(), 7.5)
    ps.set_fsg('goforward', fsg)
    ps.set_search('goforward')

    # The grammar-constrained decode should yield the same transcription.
    ps.decode()
    self.assertEqual(ps.hypothesis(), 'go forward ten meters')
def test_jsgf(self):
    """Verify that a JSGF-grammar search decodes the same phrase as the turtle LM."""
    ps = Pocketsphinx(
        lm='deps/pocketsphinx/test/data/turtle.lm.bin',
        dic='deps/pocketsphinx/test/data/turtle.dic',
    )

    # First pass: plain language-model decoding.
    ps.decode()
    self.assertEqual(ps.hypothesis(), 'go forward ten meters')

    # Second pass: build an FSG from the 'goforward.move2' JSGF rule
    # and make it the active search.
    jsgf = Jsgf('deps/pocketsphinx/test/data/goforward.gram')
    rule = jsgf.get_rule('goforward.move2')
    ps.set_fsg('goforward', jsgf.build_fsg(rule, ps.get_logmath(), 7.5))
    ps.set_search('goforward')

    ps.decode()
    self.assertEqual(ps.hypothesis(), 'go forward ten meters')
def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

    If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

    Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
    assert keyword_entries is None or all(isinstance(keyword, str) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    # import the PocketSphinx speech recognition module
    try:
        from pocketsphinx import pocketsphinx, Jsgf, FsgModel
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
    except ValueError:
        raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
    if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
        raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

    if isinstance(language, str):  # directory containing language data
        language_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
        if not os.path.isdir(language_directory):
            raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
        acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
        language_model_file = os.path.join(language_directory, "language-model.lm.bin")
        # NOTE: "pronounciation" is misspelled on purpose — it matches the
        # data file shipped with the language packs, so it must not be changed.
        phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
    if not os.path.isdir(acoustic_parameters_directory):
        raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
    if not os.path.isfile(language_model_file):
        raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
    if not os.path.isfile(phoneme_dictionary_file):
        raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

    # create decoder object
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
    decoder = pocketsphinx.Decoder(config)

    # obtain audio data
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

    def process_utterance():
        # process audio data with recognition enabled (no_search = False),
        # as a full utterance (full_utt = True)
        decoder.start_utt()
        decoder.process_raw(raw_data, False, True)
        decoder.end_utt()

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
            f.flush()

            # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
            decoder.set_kws("keywords", f.name)
            decoder.set_search("keywords")
            process_utterance()
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)  # cache the compiled grammar to speed up the next run
        else:
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)
        process_utterance()
    else:  # no keywords, perform freeform recognition
        process_utterance()

    if show_all:
        return decoder

    # return results
    hypothesis = decoder.hyp()
    if hypothesis is not None:
        return hypothesis.hypstr
    raise UnknownValueError()  # no transcriptions available
config.set_string('-lm', model_dir + '/language-model.bin') config.set_string('-dict', model_dir + '/pronounciation-dictionary.dict') config.set_string("-logfn", os.devnull) jsgf = Jsgf(grammar_path) grammar_decoders = [] pattern = re.compile('public <(.*?)> =') with open(grammar_path, 'rt') as in_file: for linenum, line in enumerate(in_file): grammar_key = pattern.findall(line) if grammar_key != []: decoder = pocketsphinx.Decoder(config) ruleGrammar = jsgf.get_rule( ('structure.' + grammar_key[0]).format(grammar_path)) fsgNext = jsgf.build_fsg(ruleGrammar, decoder.get_logmath(), 7.5) decoder.set_fsg(grammar_key[0], fsgNext) decoder.set_search(grammar_key[0]) grammar_decoders.append(decoder) class Text2Speech: CHANNEL = 'text2speech' CHANNEL_TYPE = 'brain' @staticmethod def id(): return Sense.id(__class__) @staticmethod
def __init__(self, **kwargs):
    """Build a PocketSphinx decoder from keyword options and set up ROS publishing.

    Recognized options (anything else is forwarded to the decoder config
    as a ``-<key>`` setting):

    * ``nodename`` (required) -- base name for the ``SpeechRec`` ROS topic.
    * ``esiaf_input_topic`` (required) -- consumed here but not used in
      this method; a missing key raises ``KeyError``.
    * ``grammar_file`` / ``grammar_rule`` / ``grammar_name`` -- when all
      three are given, the JSGF rule is compiled to an FSG and activated.
    * ``dic`` -- accepted as an alias for ``dict``.
    * ``hmm`` / ``lm`` / ``dict`` -- default to the bundled US-English models.
    * ``verbose`` -- unless explicitly set truthy, decoder logging is sent
      to the null device.

    Environment variables are expanded in all string-valued options.
    """
    # Let Ctrl-C shut the node down via self.stop.
    signal.signal(signal.SIGINT, self.stop)
    model_path = get_model_path()

    # Expand environment variables in every string-valued option.
    kwargs = {key: os.path.expandvars(value) if isinstance(value, str) else value
              for key, value in kwargs.items()}

    nodename = kwargs.pop('nodename')
    grammar_file = kwargs.pop('grammar_file', None)
    grammar_rule = kwargs.pop('grammar_rule', None)
    grammar_name = kwargs.pop('grammar_name', None)
    kwargs.pop('esiaf_input_topic')  # required key; value unused here

    # 'dic' is accepted as an alias for pocketsphinx's 'dict' option.
    if kwargs.get('dic') is not None and kwargs.get('dict') is None:
        kwargs['dict'] = kwargs.pop('dic')

    # Fall back to the bundled US-English models for anything unspecified.
    if kwargs.get('hmm') is None:
        kwargs['hmm'] = os.path.join(model_path, 'en-us')
    if kwargs.get('lm') is None:
        kwargs['lm'] = os.path.join(model_path, 'en-us.lm.bin')
    if kwargs.get('dict') is None and kwargs.get('dic') is None:
        kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')

    # Silence decoder logging unless verbose was explicitly enabled.
    # os.devnull is already 'nul' on Windows and '/dev/null' elsewhere,
    # so no sys.platform check is needed.
    if kwargs.pop('verbose', False) is False:
        kwargs['logfn'] = os.devnull

    # Translate the remaining options into decoder config settings.
    # (The leftover debug print of kwargs has been removed.)
    config = Decoder.default_config()
    for key, value in kwargs.items():
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(value, bool):
            config.set_boolean('-{}'.format(key), value)
        elif isinstance(value, int):
            config.set_int('-{}'.format(key), value)
        elif isinstance(value, float):
            config.set_float('-{}'.format(key), value)
        elif isinstance(value, str):
            config.set_string('-{}'.format(key), value)
    self.decoder = Decoder(config)

    # Optionally switch the decoder to a JSGF grammar search.
    if grammar_file and grammar_rule and grammar_name:
        jsgf = Jsgf(grammar_file)
        rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
        fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
        self.decoder.set_fsg(grammar_name, fsg)
        self.decoder.set_search(grammar_name)

    self.start = None
    self.finish = None
    self.speech_publisher = rospy.Publisher(nodename + '/' + 'SpeechRec', SpeechInfo, queue_size=10)