def performSpeechRecognition(self, audio_file, audio_type, user):
    """Pre-process an audio file and run Sphinx speech recognition on it.

    The processing steps are selected by the profile returned from
    ``self._createProcessingProfile(audio_type)``: conversion to wav,
    down-mixing to mono / 16 kHz, sox denoising, silence detection and
    energy denoising.  The resulting file is passed to
    ``self._callSphinxJava``.  Afterwards the original file and every
    intermediate file are copied into the user's backup directory and the
    intermediate files are deleted.

    Args:
        audio_file: Path of the audio file to recognise.
        audio_type: Audio source identifier; must be one of the supported
            values listed in ``supported_types`` below.
        user: User name.  Forwarded to the denoise service and used as the
            name of the backup sub-directory.

    Returns:
        The value returned by ``self._callSphinxJava`` on success.  On any
        failure a one-element list holding an error message is returned
        instead (this method does not raise for pipeline errors).
    """
    # Local import: only the backup step needs it.
    import shutil

    # Check if path exists
    if not os.path.isfile(audio_file):
        return [
            "Error: Something went wrong with the local audio storage"
            " Requested path: " + audio_file
        ]

    # Check that the audio_type is legit
    supported_types = (
        "headset",
        "nao_ogg",
        "nao_wav_4_ch",
        "nao_wav_1_ch",
        "nao_wav_1_ch_denoised",
        "nao_wav_1_ch_only_sox",
        "nao_wav_1_ch_denoised_only_sox",
    )
    if audio_type not in supported_types:
        return ["Error: Audio source unrecognized"]

    # Get processing profile
    profile = self._createProcessingProfile(audio_type)

    # Intermediate files created along the way; backed up and erased at
    # the end of a successful run.
    audio_to_be_erased = []
    prev_audio_file = audio_file
    next_audio_file = audio_file

    # NOTE(review): the same request object (and therefore the same
    # source_name) is reused for both transformation steps below -- confirm
    # that no profile enables both of them.
    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = audio_type
    transform_req.source_name = prev_audio_file
    transform_req.target_type = 'wav'

    # Check if sox_transform is needed
    if profile['sox_transform']:
        next_audio_file += "_transformed.wav"
        transform_req.target_name = next_audio_file
        trans_response = self._audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            return ['Audio transformation error: ' + trans_response.error]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['sox_channels_and_rate']:
        next_audio_file += "_mono16k.wav"
        transform_req.target_name = next_audio_file
        transform_req.target_channels = 1
        transform_req.target_rate = 16000
        trans_response = self._audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            return ['Audio transformation error: ' + trans_response.error]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['sox_denoising']:
        next_audio_file = prev_audio_file + "_denoised.wav"
        den_request = AudioProcessingDenoiseSrvRequest()
        den_request.audio_file = prev_audio_file
        den_request.denoised_audio_file = next_audio_file
        den_request.audio_type = audio_type
        den_request.user = user
        den_request.scale = profile['sox_denoising_scale']
        den_response = self._denoise_service(den_request)
        if den_response.success != "true":
            return ["Error:" + den_response.success]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['detect_silence']:
        # Detect silence
        silence_req = AudioProcessingDetectSilenceSrvRequest()
        silence_req.audio_file = prev_audio_file
        silence_req.threshold = profile['detect_silence_threshold']
        silence_res = self._detect_silence_service(silence_req)
        rapp_print("Silence detection results: " + str(silence_res))
        if silence_res.silence == "true":
            return [
                "Error: No speech detected. RSD = " + str(silence_res.level)
            ]

    # Up to two recognition attempts; each retry raises the energy
    # denoising scale by 0.125 and works on the previous attempt's output.
    tries = 0
    while tries < 2:
        # Perform energy denoising as well
        if profile['energy_denoising']:
            next_audio_file = prev_audio_file + "_energy_denoised.wav"
            dres = self._performEnergyDenoising(
                next_audio_file,
                prev_audio_file,
                profile['energy_denoising_init_scale'] + tries * 0.125,
            )
            if dres != "true":
                return ["Error:" + dres]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        words = self._callSphinxJava(next_audio_file)
        if self._sphinxDied:
            # Reset the flag and give up retrying.
            self._sphinxDied = False
            break

        if len(words) == 0 or (len(words) == 1 and words[0] == ""):
            tries += 1
        else:
            break

    backup_directory = os.path.expanduser(
        "~/rapp_platform_files/rapp_speech_recognition_sphinx4/") + user
    if not os.path.isdir(backup_directory):
        os.makedirs(backup_directory)

    # Keep the original file and every intermediate file.  Copies are done
    # in-process (no shell), so odd characters in paths or in the user name
    # cannot be interpreted as shell syntax, and every failure is reported.
    try:
        for source in [audio_file] + audio_to_be_erased:
            shutil.copy(
                source,
                os.path.join(backup_directory, os.path.basename(source)),
            )
    except EnvironmentError:
        return ["Error: Server cp malfunctioned"]

    # Try to remove every intermediate file before reporting a failure.
    removal_failed = False
    for temp_file in audio_to_be_erased:
        try:
            os.remove(temp_file)
        except OSError:
            removal_failed = True
    if removal_failed:
        return ["Error: Server rm malfunctioned"]

    return words
def performSpeechRecognition(self, audio_file, audio_source, user):
    """Pre-process an audio file and run Sphinx speech recognition on it.

    The processing steps are selected by the profile returned from
    ``self.createProcessingProfile(audio_source)``: conversion to wav,
    down-mixing to mono / 16 kHz, sox denoising, silence detection and
    energy denoising.  The resulting file is passed to
    ``self.callSphinxJava``.  Afterwards the original file and every
    intermediate file are copied into the user's backup directory and the
    intermediate files are deleted.

    Args:
        audio_file: Path of the audio file to recognise.
        audio_source: Audio source identifier; must be one of the supported
            values listed in ``supported_sources`` below.
        user: User name.  Forwarded to the denoise service and used as the
            name of the backup sub-directory.

    Returns:
        The value returned by ``self.callSphinxJava`` on success, or a
        one-element list holding an error message for storage, denoising,
        silence-detection, backup and cleanup failures.

    Raises:
        RappError: If the audio transformation service reports an error.
    """
    # Local import: only the backup step needs it.
    import shutil

    # Check if path exists
    if not os.path.isfile(audio_file):
        return [
            "Error: Something went wrong with the local audio storage"
            " Requested path: " + audio_file
        ]

    # Check that the audio_source is legit
    supported_sources = (
        "headset",
        "nao_ogg",
        "nao_wav_4_ch",
        "nao_wav_1_ch",
        "nao_wav_1_ch_denoised",
        "nao_wav_1_ch_only_sox",
        "nao_wav_1_ch_denoised_only_sox",
    )
    if audio_source not in supported_sources:
        return ["Error: Audio source unrecognized"]

    # Get processing profile
    profile = self.createProcessingProfile(audio_source)

    # Intermediate files created along the way; backed up and erased at
    # the end of a successful run.
    audio_to_be_erased = []
    prev_audio_file = audio_file
    next_audio_file = audio_file

    # NOTE(review): the same request object (and therefore the same
    # source_name) is reused for both transformation steps below -- confirm
    # that no profile enables both of them.
    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = audio_source
    transform_req.source_name = prev_audio_file
    transform_req.target_type = 'wav'

    # Check if sox_transform is needed
    if profile['sox_transform']:
        next_audio_file += "_transformed.wav"
        transform_req.target_name = next_audio_file
        trans_response = self.audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            raise RappError(
                'Audio transformation error: ' + trans_response.error)
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['sox_channels_and_rate']:
        next_audio_file += "_mono16k.wav"
        transform_req.target_name = next_audio_file
        transform_req.target_channels = 1
        transform_req.target_rate = 16000
        trans_response = self.audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            raise RappError(
                'Audio transformation error: ' + trans_response.error)
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['sox_denoising']:
        next_audio_file = prev_audio_file + "_denoised.wav"
        den_request = AudioProcessingDenoiseSrvRequest()
        den_request.audio_file = prev_audio_file
        den_request.denoised_audio_file = next_audio_file
        den_request.audio_type = audio_source
        den_request.user = user
        den_request.scale = profile['sox_denoising_scale']
        den_response = self.denoise_service(den_request)
        if den_response.success != "true":
            return ["Error:" + den_response.success]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

    if profile['detect_silence']:
        # Detect silence
        silence_req = AudioProcessingDetectSilenceSrvRequest()
        silence_req.audio_file = prev_audio_file
        silence_req.threshold = profile['detect_silence_threshold']
        silence_res = self.detect_silence_service(silence_req)
        rapp_print("Silence detection results: " + str(silence_res))
        if silence_res.silence == "true":
            return [
                "Error: No speech detected. RSD = " + str(silence_res.level)
            ]

    # Up to two recognition attempts; each retry raises the energy
    # denoising scale by 0.125 and works on the previous attempt's output.
    tries = 0
    while tries < 2:
        # Perform energy denoising as well
        if profile['energy_denoising']:
            next_audio_file = prev_audio_file + "_energy_denoised.wav"
            dres = self.performEnergyDenoising(
                next_audio_file,
                prev_audio_file,
                profile['energy_denoising_init_scale'] + tries * 0.125,
            )
            if dres != "true":
                return ["Error:" + dres]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        words = self.callSphinxJava(next_audio_file)
        if self.sphinxDied:
            # Reset the flag and give up retrying.
            self.sphinxDied = False
            break

        if len(words) == 0 or (len(words) == 1 and words[0] == ""):
            tries += 1
        else:
            break

    backup_directory = os.path.expanduser(
        "~/rapp_platform_files/rapp_speech_recognition_sphinx4/") + user
    if not os.path.isdir(backup_directory):
        os.makedirs(backup_directory)

    # Keep the original file and every intermediate file.  Copies are done
    # in-process (no shell), so odd characters in paths or in the user name
    # cannot be interpreted as shell syntax, and every failure is reported.
    try:
        for source in [audio_file] + audio_to_be_erased:
            shutil.copy(
                source,
                os.path.join(backup_directory, os.path.basename(source)),
            )
    except EnvironmentError:
        return ["Error: Server cp malfunctioned"]

    # Try to remove every intermediate file before reporting a failure.
    removal_failed = False
    for temp_file in audio_to_be_erased:
        try:
            os.remove(temp_file)
        except OSError:
            removal_failed = True
    if removal_failed:
        return ["Error: Server rm malfunctioned"]

    return words
def speech_to_text(self, file_path, user, audio_file_type, language):
    """Transcribe an audio file through the Google speech-api v2 endpoint.

    The user is first looked up through the database service.  The file is
    converted to mono 16 kHz wav, optionally run through sox and the
    denoise service (NAO sources only), converted to flac and POSTed to
    ``www.google.com/speech-api/v2/recognize``.  All temporary files are
    removed before returning, whether the call succeeds or fails.

    Args:
        file_path: Path of the audio file to transcribe.
        user: User name; checked against the database and forwarded to the
            denoise service.
        audio_file_type: Audio source identifier.  ``'nao_ogg'``,
            ``'nao_wav_1_ch'`` and ``'nao_wav_4_ch'`` trigger denoising.
        language: Language code; ``'en'`` is mapped to ``'en-US'`` and
            ``'gr'`` to ``'el'``, anything else is sent unchanged.

    Returns:
        The JSON-decoded response body.

    Raises:
        RappError: If the user is not found, the file does not exist, a
            conversion / sox / denoise step fails, or (after an otherwise
            successful run) a temporary file cannot be removed.
    """
    # Local imports: only needed by this method.
    import errno
    import subprocess

    # Check the user
    serv_db_topic = rospy.get_param("rapp_mysql_wrapper_user_fetch_data_topic")
    authentication_service = rospy.ServiceProxy(serv_db_topic, fetchDataSrv)
    req_db = fetchDataSrv()
    req_db.req_cols = ["username"]
    entry1 = ["username", user]
    req_db.where_data = [StringArrayMsg(s=entry1)]

    resp = authentication_service(req_db.req_cols, req_db.where_data)
    if resp.success.data != True or len(resp.res_data) == 0:
        raise RappError("Non authenticated user")

    # Check if file exists
    if not os.path.isfile(file_path):
        raise RappError("Error: file " + file_path + ' not found')

    audio_trans_topic = rospy.get_param(
        "rapp_audio_processing_transform_audio_topic")
    audio_transform_srv = rospy.ServiceProxy(
        audio_trans_topic, AudioProcessingTransformAudioSrv)

    # Temporary files; registered before each step runs so that partial
    # outputs of a failed step are removed as well.
    cleanup = []
    succeeded = False
    try:
        # Convert the input to mono 16 kHz wav
        new_audio = file_path + '.wav'
        transform_req = AudioProcessingTransformAudioSrvRequest()
        transform_req.source_type = audio_file_type
        transform_req.source_name = file_path
        transform_req.target_type = 'wav'
        transform_req.target_name = new_audio
        transform_req.target_channels = 1
        transform_req.target_rate = 16000
        cleanup.append(new_audio)
        trans_response = audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            raise RappError(trans_response.error)

        # Denoise if necessary
        prev_audio_file = new_audio
        if audio_file_type in ('nao_ogg', 'nao_wav_1_ch', 'nao_wav_4_ch'):
            denoise_topic = rospy.get_param(
                "rapp_audio_processing_denoise_topic")
            denoise_service = rospy.ServiceProxy(
                denoise_topic, AudioProcessingDenoiseSrv)
            sox_denoising_scale = 0.15

            # Optional sox pass, run with an argument list (no shell) so
            # that file names are never interpreted as shell syntax.
            sox_args = None
            if audio_file_type == "nao_ogg":
                next_audio_file = prev_audio_file + "_transformed.wav"
                sox_args = ["sox", prev_audio_file, next_audio_file]
            elif audio_file_type == "nao_wav_4_ch":
                next_audio_file = prev_audio_file + "_mono16k.wav"
                sox_args = ["sox", prev_audio_file,
                            "-r", "16000", "-c", "1", next_audio_file]
            if sox_args is not None:
                cleanup.append(next_audio_file)
                try:
                    sox_status = subprocess.call(sox_args)
                except OSError:
                    # sox could not be started at all.
                    sox_status = -1
                if sox_status != 0:
                    raise RappError("Error: sox malfunctioned")
                prev_audio_file = next_audio_file

            next_audio_file = prev_audio_file + "_denoised.wav"
            den_request = AudioProcessingDenoiseSrvRequest()
            den_request.audio_file = prev_audio_file
            den_request.denoised_audio_file = next_audio_file
            den_request.audio_type = audio_file_type
            den_request.user = user
            den_request.scale = sox_denoising_scale
            cleanup.append(next_audio_file)
            den_response = denoise_service(den_request)
            if den_response.success != "true":
                raise RappError("Error:" + den_response.success)
            prev_audio_file = next_audio_file

        # Transform to flac.  The source is the last processed file, so the
        # denoised audio (when produced) is what gets transcribed.
        newer_audio = new_audio + '.flac'
        transform_req = AudioProcessingTransformAudioSrvRequest()
        transform_req.source_type = 'headset'
        transform_req.source_name = prev_audio_file
        transform_req.target_type = 'flac'
        transform_req.target_name = newer_audio
        transform_req.target_channels = 1
        transform_req.target_rate = 16000
        cleanup.append(newer_audio)
        trans_response = audio_transform_srv(transform_req)
        if trans_response.error != 'success':
            raise RappError(trans_response.error)

        # flac is binary data: read it as bytes.
        with open(newer_audio, "rb") as f:
            speech = f.read()

        url = "www.google.com"
        # Fix language
        if language == 'en':
            language = "en-US"
        elif language == 'gr':
            language = 'el'

        # NOTE - That's a general usage key. They may disable it in the future.
        # NOTE(review): hard-coded credential in source; consider moving it
        # to configuration.
        key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
        path = "/speech-api/v2/recognize?lang=" + language + "&key=" + key
        # NOTE(review): the audio above is converted at 16000 Hz but the
        # header announces rate=22050 -- confirm which one is intended.
        headers = {"Content-type": "audio/x-flac; rate=22050"}

        conn = httplib.HTTPSConnection(url)
        try:
            conn.request("POST", path, speech, headers)
            response = conn.getresponse()
            data = response.read()
        finally:
            conn.close()

        initial_data = data
        # Google returns one empty result for some reason here. Removing it..
        index = data.find("}")
        data = data[index + 1:]
        if data == '\n':
            # Returned nothing.. something went wrong
            data = initial_data
        jsdata = json.loads(data)
        succeeded = True
    finally:
        # Remove every temporary file.  Missing files are fine (the step
        # may have failed before creating them).
        removal_failed = False
        for temp_file in cleanup:
            try:
                os.remove(temp_file)
            except OSError as exc:
                if exc.errno != errno.ENOENT:
                    removal_failed = True
        # Only report a cleanup problem when nothing else went wrong, so
        # that it never hides the original exception.
        if removal_failed and succeeded:
            raise RappError("Error: Removal of temporary file malfunctioned")

    return jsdata
def performSpeechRecognition(self, audio_file, audio_type, user):
    """Pre-process an audio file and run Sphinx speech recognition on it.

    The processing steps are selected by the profile returned from
    ``self._createProcessingProfile(audio_type)``: conversion to wav,
    down-mixing to mono / 16 kHz, sox denoising, silence detection and
    energy denoising.  The resulting file is passed to
    ``self._callSphinxJava``.  Every intermediate file created on the way
    is removed before returning, on success and on failure alike.

    Args:
        audio_file: Path of the audio file to recognise.
        audio_type: Audio source identifier; must be one of the supported
            values listed in ``supported_types`` below.
        user: User name, forwarded to the denoise service.

    Returns:
        The value returned by ``self._callSphinxJava`` on success.  On any
        failure a one-element list holding an error message is returned
        instead (this method does not raise for pipeline errors).
    """
    # Check if path exists
    if not os.path.isfile(audio_file):
        return [
            "Error: Something went wrong with the local audio storage"
            " Requested path: " + audio_file
        ]

    # Check that the audio_type is legit
    supported_types = (
        "headset",
        "nao_ogg",
        "nao_wav_4_ch",
        "nao_wav_1_ch",
        "nao_wav_1_ch_denoised",
        "nao_wav_1_ch_only_sox",
        "nao_wav_1_ch_denoised_only_sox",
    )
    if audio_type not in supported_types:
        return ["Error: Audio source unrecognized"]

    # Get processing profile
    profile = self._createProcessingProfile(audio_type)

    # Intermediate files created along the way; always erased in the
    # ``finally`` clause below.
    audio_to_be_erased = []
    cleanup_failed = False
    try:
        prev_audio_file = audio_file
        next_audio_file = audio_file

        # NOTE(review): the same request object (and therefore the same
        # source_name) is reused for both transformation steps below --
        # confirm that no profile enables both of them.
        transform_req = AudioProcessingTransformAudioSrvRequest()
        transform_req.source_type = audio_type
        transform_req.source_name = prev_audio_file
        transform_req.target_type = 'wav'

        # Check if sox_transform is needed
        if profile['sox_transform']:
            next_audio_file += "_transformed.wav"
            transform_req.target_name = next_audio_file
            trans_response = self._audio_transform_srv(transform_req)
            if trans_response.error != 'success':
                return [
                    'Audio transformation error: ' + trans_response.error
                ]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['sox_channels_and_rate']:
            next_audio_file += "_mono16k.wav"
            transform_req.target_name = next_audio_file
            transform_req.target_channels = 1
            transform_req.target_rate = 16000
            trans_response = self._audio_transform_srv(transform_req)
            if trans_response.error != 'success':
                return [
                    'Audio transformation error: ' + trans_response.error
                ]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['sox_denoising']:
            next_audio_file = prev_audio_file + "_denoised.wav"
            den_request = AudioProcessingDenoiseSrvRequest()
            den_request.audio_file = prev_audio_file
            den_request.denoised_audio_file = next_audio_file
            den_request.audio_type = audio_type
            den_request.user = user
            den_request.scale = profile['sox_denoising_scale']
            den_response = self._denoise_service(den_request)
            if den_response.success != "true":
                return ["Error:" + den_response.success]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['detect_silence']:
            # Detect silence
            silence_req = AudioProcessingDetectSilenceSrvRequest()
            silence_req.audio_file = prev_audio_file
            silence_req.threshold = profile['detect_silence_threshold']
            silence_res = self._detect_silence_service(silence_req)
            RappUtilities.rapp_print(
                "Silence detection results: " + str(silence_res))
            if silence_res.silence == "true":
                return [
                    "Error: No speech detected. RSD = "
                    + str(silence_res.level)
                ]

        # Up to two recognition attempts; each retry raises the energy
        # denoising scale by 0.125 and works on the previous attempt's
        # output.
        tries = 0
        while tries < 2:
            # Perform energy denoising as well
            if profile['energy_denoising']:
                next_audio_file = prev_audio_file + "_energy_denoised.wav"
                dres = self._performEnergyDenoising(
                    next_audio_file,
                    prev_audio_file,
                    profile['energy_denoising_init_scale'] + tries * 0.125,
                )
                if dres != "true":
                    return ["Error:" + dres]
                audio_to_be_erased.append(next_audio_file)
                prev_audio_file = next_audio_file

            words = self._callSphinxJava(next_audio_file)
            if self._sphinxDied:
                # Reset the flag and give up retrying.
                self._sphinxDied = False
                break

            if len(words) == 0 or (len(words) == 1 and words[0] == ""):
                tries += 1
            else:
                break
    finally:
        # Try to remove every intermediate file, even if one removal fails
        # and even when an error is being returned above.
        for temp_file in audio_to_be_erased:
            try:
                os.remove(temp_file)
            except OSError:
                cleanup_failed = True

    # Only reached on the success path: a failed cleanup is reported instead
    # of the recognition result.
    if cleanup_failed:
        return ["Error: Server rm malfunctioned"]

    return words