Example #1
    def performSpeechRecognition(self, audio_file, audio_type, user):
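        """Preprocess the given audio file according to its source type and
        run Sphinx4 speech recognition on it, returning the recognized words
        (or a single-element error list)."""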
        # Check if path exists
        if not os.path.isfile(audio_file):
            return [
                "Error: Something went wrong with the local audio storage. "
                "Requested path: " + audio_file
            ]

        # Keep extra audio files that need erasing
        audio_to_be_erased = []

        # If it is an .ogg file (from NAO) recode it into .wav
        next_audio_file = audio_file
        prev_audio_file = next_audio_file

        audio_file_folder = os.path.dirname(audio_file)
        if audio_file_folder[-1] != "/":
            audio_file_folder += "/"

        # Check that the audio_type is legit
        if audio_type not in [
            "headset",
            "nao_ogg",
            "nao_wav_4_ch",
            "nao_wav_1_ch",
            "nao_wav_1_ch_denoised",
            "nao_wav_1_ch_only_sox",
            "nao_wav_1_ch_denoised_only_sox"
        ]:
            return ["Error: Audio source unrecognized"]

        # Get processing profile
        profile = self._createProcessingProfile(audio_type)

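        # Prepare a reusable transform request for the sox-based conversions below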
        transform_req = AudioProcessingTransformAudioSrvRequest()
        transform_req.source_type = audio_type
        transform_req.source_name = prev_audio_file
        transform_req.target_type = 'wav'

        # Check if sox_transform is needed
        if profile['sox_transform']:
            next_audio_file += "_transformed.wav"
            transform_req.target_name = next_audio_file

            trans_response = self._audio_transform_srv(transform_req)

            if trans_response.error != 'success':
                return ['Audio transformation error: ' + trans_response.error]
                #raise RappError( 'Audio transformation error: ' + error )

            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file
        if profile['sox_channels_and_rate']:
            next_audio_file += "_mono16k.wav"
            transform_req.target_name = next_audio_file
            transform_req.target_channels = 1
            transform_req.target_rate = 16000

            trans_response = self._audio_transform_srv(transform_req)

            if trans_response.error != 'success':
                return ['Audio transformation error: ' + trans_response.error]
                #raise RappError( 'Audio transformation error: ' + error )
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file
        if profile['sox_denoising']:
            next_audio_file = prev_audio_file + "_denoised.wav"
            den_request = AudioProcessingDenoiseSrvRequest()
            den_request.audio_file = prev_audio_file
            den_request.denoised_audio_file = next_audio_file
            den_request.audio_type = audio_type
            den_request.user = user
            den_request.scale = profile['sox_denoising_scale']
            den_response = self._denoise_service(den_request)
            if den_response.success != "true":
                return ["Error:" + den_response.success]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file
        if profile['detect_silence']:
            # Detect silence
            silence_req = AudioProcessingDetectSilenceSrvRequest()
            silence_req.audio_file = prev_audio_file
            silence_req.threshold = profile['detect_silence_threshold']
            silence_res = self._detect_silence_service(silence_req)
            rapp_print("Silence detection results: " + str(silence_res))
            if silence_res.silence == "true":
                return [
                    "Error: No speech detected. RSD = " +
                    str(silence_res.level)
                ]

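        # Try recognition up to two times; if the first pass yields no words,
        # retry with a higher energy denoising scale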
        tries = 0
        while tries < 2:
            # Perform energy denoising as well
            if profile['energy_denoising']:
                next_audio_file = prev_audio_file + "_energy_denoised.wav"
                dres = self._performEnergyDenoising(
                    next_audio_file, prev_audio_file,
                    profile['energy_denoising_init_scale'] + tries * 0.125)
                if dres != "true":
                    return ["Error:" + dres]
                audio_to_be_erased.append(next_audio_file)
                prev_audio_file = next_audio_file

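            # Run the Sphinx4 Java recognizer on the processed file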
            new_audio_file = next_audio_file
            words = self._callSphinxJava(new_audio_file)
            if self._sphinxDied:
                self._sphinxDied = False
                break

            if len(words) == 0 or (len(words) == 1 and words[0] == ""):
                tries += 1
            else:
                break

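        # Back up the original and intermediate files under the user's folder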
        backup_directory = (
            os.path.expanduser("~/rapp_platform_files/rapp_speech_recognition_sphinx4/")
            + user
        )
        if not os.path.isdir(backup_directory):
            os.makedirs(backup_directory)

        # Keep the original file:
        command = "cp " + audio_file + " " + backup_directory + "/" + \
                audio_file.split("/")[-1]
        com_res = os.system(command)
        if com_res != 0:
            return ["Error: Server cp malfunctioned"]

        for f in audio_to_be_erased:
            clean_file = f.split("/")[-1]
            command = "cp " + f + " " + backup_directory + \
                "/" + clean_file
            com_res = os.system(command)
            if com_res != 0:
                return ["Error: Server cp malfunctioned"]

        for f in audio_to_be_erased:
            command = "rm " + f
            com_res = os.system(command)
            if com_res != 0:
                return ["Error: Server rm malfunctioned"]

        return words
  def speech_to_text(self, file_path, user, audio_file_type, language):
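    """Authenticate the user, preprocess the audio and forward it to the
    Google Speech API, returning the parsed JSON response."""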

    # Check the user
    serv_db_topic = rospy.get_param("rapp_mysql_wrapper_user_fetch_data_topic")
    authentication_service = rospy.ServiceProxy(serv_db_topic, fetchDataSrv)
    req_db = fetchDataSrv()
    req_db.req_cols = ["username"]
    entry1 = ["username", user]
    req_db.where_data = [StringArrayMsg(s=entry1)]

    resp = authentication_service(req_db.req_cols, req_db.where_data)
    if not resp.success.data or len(resp.res_data) == 0:
        raise RappError("Non authenticated user")

    # Check if file exists
    if not os.path.isfile(file_path):
        raise RappError("Error: file " + file_path + ' not found')

    # Convert the input audio to a mono 16 kHz wav; it is transcoded to flac further below
    new_audio = file_path

    audio_trans_topic = rospy.get_param("rapp_audio_processing_transform_audio_topic")
    audio_transform_srv = rospy.ServiceProxy( audio_trans_topic, AudioProcessingTransformAudioSrv )

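    # Intermediate files to be removed once recognition finishes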
    cleanup = []

    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = audio_file_type
    transform_req.source_name = new_audio
    transform_req.target_type = 'wav'
    new_audio += '.wav'
    transform_req.target_name = new_audio
    transform_req.target_channels = 1
    transform_req.target_rate = 16000

    trans_response = audio_transform_srv( transform_req )

    if trans_response.error != 'success':
        raise RappError( trans_response.error )
    cleanup.append(new_audio)

    # Denoise if necessary
    prev_audio_file = new_audio
    next_audio_file = prev_audio_file
    if audio_file_type in ['nao_ogg', 'nao_wav_1_ch', 'nao_wav_4_ch']:
        denoise_topic = rospy.get_param("rapp_audio_processing_denoise_topic")
        energy_denoise_topic = \
            rospy.get_param("rapp_audio_processing_energy_denoise_topic")
        denoise_service = rospy.ServiceProxy(\
            denoise_topic, AudioProcessingDenoiseSrv)
        energy_denoise_service = rospy.ServiceProxy(\
            energy_denoise_topic, AudioProcessingDenoiseSrv)

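        # Build the per-source manipulation profile (which sox steps to apply)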
        manipulation = {}
        manipulation['sox_transform'] = False
        manipulation['sox_denoising'] = False
        manipulation['sox_channels_and_rate'] = False
        if audio_file_type == "headset":
            pass
        elif audio_file_type == "nao_ogg":
            manipulation['sox_transform'] = True
            manipulation['sox_denoising'] = True
            manipulation['sox_denoising_scale'] = 0.15
        elif audio_file_type == "nao_wav_4_ch":
            manipulation['sox_channels_and_rate'] = True
            manipulation['sox_denoising'] = True
            manipulation['sox_denoising_scale'] = 0.15
        elif audio_file_type == "nao_wav_1_ch":
            manipulation['sox_denoising'] = True
            manipulation['sox_denoising_scale'] = 0.15
            manipulation['detect_silence'] = True
            manipulation['detect_silence_threshold'] = 0.25

        # Check if sox_transform is needed
        if manipulation['sox_transform']:
            next_audio_file += "_transformed.wav"
            command = "sox " + prev_audio_file + " " + next_audio_file
            com_res = os.system(command)
            if com_res != 0:
                raise RappError("Error: sox malfunctioned")
            cleanup.append(next_audio_file)
            prev_audio_file = next_audio_file
        if manipulation['sox_channels_and_rate']:
            next_audio_file += "_mono16k.wav"
            command = "sox " + prev_audio_file + " -r 16000 -c 1 " + next_audio_file
            com_res = os.system(command)
            if com_res != 0:
                raise RappError("Error: sox malfunctioned")
            cleanup.append(next_audio_file)
            prev_audio_file = next_audio_file
        if manipulation['sox_denoising']:
            next_audio_file = prev_audio_file + "_denoised.wav"
            den_request = AudioProcessingDenoiseSrvRequest()
            den_request.audio_file = prev_audio_file
            den_request.denoised_audio_file = next_audio_file
            den_request.audio_type = audio_file_type
            den_request.user = user
            den_request.scale = manipulation['sox_denoising_scale']
            den_response = denoise_service(den_request)
            if den_response.success != "true":
                raise RappError("Error:" + den_response.success)
            cleanup.append(next_audio_file)
            prev_audio_file = next_audio_file

            # TODO: implement a fallback function to clear redundant files

    # Transform to flac
    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = 'headset'
    transform_req.source_name = new_audio
    transform_req.target_type = 'flac'
    newer_audio = new_audio + '.flac'
    transform_req.target_name = newer_audio
    transform_req.target_channels = 1
    transform_req.target_rate = 16000

    trans_response = audio_transform_srv( transform_req )
    cleanup.append(newer_audio)

    if trans_response.error != 'success':
        raise RappError( trans_response.error )


    # Open the file
    with open(newer_audio, "rb") as f:
        speech = f.read()
    url = "www.google.com"

    # Fix language
    if language == 'en':
        language = "en-US"
    elif language == 'gr':
        language = 'el'

    # NOTE: That's a general-usage key. They may disable it in the future.
    key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    path = "/speech-api/v2/recognize?lang=" + language + "&key=" + key
    # The declared sample rate must match the flac produced above (16 kHz)
    headers = {"Content-type": "audio/x-flac; rate=16000"}
    params = {"xjerr": "1", "client": "chromium"}
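    # POST the flac samples to the Google Speech API v2 endpoint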
    conn = httplib.HTTPSConnection(url)
    conn.request("POST", path, speech, headers)
    response = conn.getresponse()
    data = response.read()
    initial_data = data
    # Google returns one empty result for some reason here. Removing it..
    index = data.find("}")
    data = data[index + 1:]
    if data == '\n':
        # Returned nothing.. something went wrong
        data = initial_data
    jsdata = json.loads(data)

    # Remove the flac if needed
    for f in cleanup:
        command = 'rm -f ' + f
        if os.system(command):
            raise RappError("Error: Removal of temporary file malfunctioned")
    return jsdata