def process_result(self, data): results = {} for shypo in data.xpath("//SHYPO"): transcript = [] confidence = 0.0 for whypo in shypo.xpath("./WHYPO"): word = whypo.attrib["WORD"].encode(self.encoding) if word.startswith("<"): continue transcript.append(word) confidence += float(whypo.attrib["CM"]) confidence /= len(transcript) transcript = " ".join(transcript) results[confidence] = transcript msg = SpeechRecognitionCandidates() debug_str = ["Recognized:"] for i, result in enumerate(sorted(results.items(), reverse=True)): c, t = result debug_str += ["%d: %s (%.2f)" % (i+1, t, c)] msg.transcript.append(t) msg.confidence.append(c) self.pub_speech_recognition.publish(msg) self.last_speech = msg rospy.logdebug(os.linesep.join(debug_str))
def __init__(self): # load parameters self.encoding = rospy.get_param("~encoding", "utf-8") self.default_duration = rospy.get_param("~duration", 3.0) self.default_threshold = rospy.get_param("~threshold", 0.8) self.use_isolated_word = rospy.get_param("~use_isolated_word", True) self.start_signal_action_timeout = rospy.get_param( "~start_signal_action_timeout", 0.3) self.pub_speech_recognition = rospy.Publisher( "speech_to_text", SpeechRecognitionCandidates, queue_size=1) # initialize sound play self.act_sound = actionlib.SimpleActionClient("sound_play", SoundRequestAction) if not self.act_sound.wait_for_server(rospy.Duration(5.0)): rospy.logwarn( "Failed to find sound_play action. Disabled audio alert") self.act_sound = None # setup julius host = rospy.get_param("~host", "localhost") module_port = rospy.get_param("~module_port", 10500) audio_port = rospy.get_param("~audio_port", 10501) max_retry = rospy.get_param("~max_connection_retry", 0) self.module = ModuleClient(host, module_port, max_retry, self.encoding) self.audio = AudioTransport(host, audio_port, max_retry, "audio") rospy.on_shutdown(self.shutdown_cb) self.module.on_received_data(self.julius_cb) self.module.start() self.audio.start() self.status() self.start() # start subscribe self.lock = Lock() self.last_speech = SpeechRecognitionCandidates() self.vocabularies = {} self.srv_show_status = rospy.Service("show_julius_status", Empty, self.status) if self.use_isolated_word: self.sub_vocabulary = rospy.Subscriber("vocabulary", Vocabulary, self.vocabulary_cb) else: self.sub_grammar = rospy.Subscriber("grammar", Grammar, self.grammar_cb) self.srv_speech_recognition = rospy.Service("speech_recognition", SpeechRecognition, self.speech_recognition_cb)
def speech_recognition_srv_cb(self, req): res = SpeechRecognitionResponse() duration = req.duration if duration <= 0.0: duration = self.default_duration with self.audio as src: if self.dynamic_energy_threshold: self.recognizer.adjust_for_ambient_noise(src) rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold) if not req.quiet: self.play_sound("start", 0.1) start_time = rospy.Time.now() while (rospy.Time.now() - start_time).to_sec() < duration: rospy.loginfo("Waiting for speech...") try: audio = self.recognizer.listen( src, timeout=self.listen_timeout, phrase_time_limit=self.phrase_time_limit) except SR.WaitTimeoutError as e: rospy.logwarn(e) break if not req.quiet: self.play_sound("recognized", 0.05) rospy.loginfo("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data())) try: result = self.recognize(audio) rospy.loginfo("Result: %s" % result.encode('utf-8')) if not req.quiet: self.play_sound("success", 0.1) res.result = SpeechRecognitionCandidates( transcript=[result]) return res except SR.UnknownValueError: if self.dynamic_energy_threshold: self.recognizer.adjust_for_ambient_noise(src) rospy.loginfo("Set minimum energy threshold to %f" % self.recognizer.energy_threshold) except SR.RequestError as e: rospy.logerr("Failed to recognize: %s" % str(e)) rospy.sleep(0.1) if rospy.is_shutdown(): break # Timeout if not req.quiet: self.play_sound("timeout", 0.1) return res
def audio_cb(self, msg): if self.is_canceling: rospy.loginfo("Speech is cancelled") return data = SR.AudioData(msg.data, self.sample_rate, self.sample_width) try: if self.asr_engine == "google_legacy_single_utterance": rospy.loginfo("Waiting for result %d" % len(data.get_raw_data())) result = self.recognizer.recognize_google( data, language=self.language) msg = SpeechRecognitionCandidates(transcript=[result]) rospy.loginfo("Got result: %s" % msg) self.pub_speech.publish(msg) elif self.asr_engine == "google_cloud_single_utterance": rospy.loginfo("Waiting for result %d" % len(data.get_raw_data())) result = self.recognizer.recognize_google_cloud( data, language=self.language) msg = SpeechRecognitionCandidates(transcript=[result]) rospy.loginfo("Got result {}: {}".format(len(data.get_raw_data()), msg)) self.pub_speech.publish(msg) elif self.asr_engine == "ibm_single_utterance": rospy.loginfo("Waiting for result %d" % len(data.get_raw_data())) result = self.recognizer.recognize_ibm_single_utterance( data, key=os.environ['IBM_API'], language=self.language ) #url=os.environ['IBM_URL']) #do not forget /v1/recognize ending msg = SpeechRecognitionCandidates(transcript=[result]) rospy.loginfo("Got result {}: {}".format(len(data.get_raw_data()), msg)) self.pub_speech.publish(msg) else: rospy.logerr("No valid Recognizer was set (in the roslaunch file): %s" % self.asr_engine) except SR.UnknownValueError as e: rospy.logerr("Failed to recognize: %s" % str(e)) except SR.RequestError as e: rospy.logerr("Failed to recognize: %s" % str(e))
def audio_cb(self, msg): if self.is_canceling: rospy.loginfo("Speech is cancelled") return data = SR.AudioData(msg.data, self.sample_rate, self.sample_width) try: rospy.loginfo("Waiting for result %d" % len(data.get_raw_data())) result = self.recognizer.recognize_google(data, language=self.language) msg = SpeechRecognitionCandidates(transcript=[result]) self.pub_speech.publish(msg) except SR.UnknownValueError as e: rospy.logerr("Failed to recognize: %s" % str(e)) except SR.RequestError as e: rospy.logerr("Failed to recognize: %s" % str(e))
def audio_cb(self, _, audio): try: rospy.logdebug("Waiting for result... (Sent %d bytes)" % len(audio.get_raw_data())) result = self.recognize(audio) rospy.loginfo("Result: %s" % result.encode('utf-8')) msg = SpeechRecognitionCandidates(transcript=[result]) self.pub.publish(msg) except SR.UnknownValueError as e: if self.dynamic_energy_threshold: self.recognizer.adjust_for_ambient_noise(self.audio) rospy.loginfo("Updated energy threshold to %f" % self.recognizer.energy_threshold) except SR.RequestError as e: rospy.logerr("Failed to recognize: %s" % str(e))
def speech_recognition_cb(self, req): res = SpeechRecognitionResponse() # change grammar candidate_words = [] if req.grammar_name: ok = self.activate_gram(req.grammar_name) if not ok: rospy.logerr("failed to activate grammar %s" % req.grammar_name) return res if req.grammar_name in self.vocabularies: candidate_words = self.vocabularies[req.grammar_name] elif req.grammar.rules: g = req.grammar if not g.name: g.name = 'unknown' grammar = make_grammar_from_rules(g.rules) voca = make_voca_from_categories(g.categories, g.vocabularies) result = make_dfa(grammar, voca) if result is None: rospy.logerr("Failed to make dfa from grammar message") return res dfa, dic = result ok = self.change_gram(g.name, dfa.split(os.linesep), dic.split(os.linesep)) if not ok: rospy.logerr("Failed to change grammar") return res self.vocabularies[g.name] = list(set(e for v in msg.vocabularies for e in v.words)) candidate_words = self.vocabularies[g.name] elif req.vocabulary.words: v = req.vocabulary if not v.name: v.name = 'unknown' if len(v.phonemes) == 0 or not v.phonemes[0]: v.phonemes = make_phonemes_from_words(v.words) dic = [" %s\t%s" % (w, p) for w, p in zip(v.words, v.phonemes)] ok = self.change_gram(v.name, None, dic) if not ok: rospy.logerr("Failed to change vocabulary") return res self.vocabularies[v.name] = list(set(v.words)) candidate_words = self.vocabularies[v.name] else: rospy.logerr("Invalid request: 'grammar_name', 'grammar' or 'vocabulary' must be filled") return res duration = req.duration if duration <= 0.0: duration = self.default_duration threshold = req.threshold if threshold <= 0.0 or threshold > 1.0: threshold = self.default_threshold if not req.quiet: self.play_sound(self.start_signal, self.start_signal_action_timeout) start_time = rospy.Time.now() self.last_speech = SpeechRecognitionCandidates() while (rospy.Time.now() - start_time).to_sec() < duration: rospy.sleep(0.1) speech = self.last_speech if not self.last_speech.transcript: continue if candidate_words: ok = speech.transcript[0] in candidate_words and speech.confidence[0] >= threshold else: ok = speech.confidence[0] >= threshold if ok: t0 = speech.transcript[0] c0 = speech.confidence[0] rospy.loginfo("Recognized %s (%f)..." % (t0, c0)) if not req.quiet: self.play_sound(self.success_signal, 0.1) res.result = speech return res # timeout rospy.logerr("Timed out") if not req.quiet: self.play_sound(self.timeout_signal, 0.1) return res
def androidCallback(self, msg): self.pub.publish(SpeechRecognitionCandidates(transcript=msg.texts))
#!/usr/bin/env python import rospy from std_msgs.msg import String from speech_recognition_msgs.msg import SpeechRecognitionCandidates if __name__ == "__main__": text = SpeechRecognitionCandidates() text.transcript.append("th\u1EDDi ti\u1EBFt") text.confidence.append(1) rospy.init_node("fake_speech") speech = rospy.Publisher('/speech_to_text', SpeechRecognitionCandidates, queue_size=2) rate = rospy.Rate(10) while not rospy.is_shutdown(): speech.publish(text) rate.sleep()