示例#1
0
    def performSpeechRecognition(self, audio_file, audio_type, user):
        """Pre-process an audio file and run Sphinx recognition on it.

        The processing steps are selected by the profile returned from
        ``self._createProcessingProfile(audio_type)``: format transformation,
        channel/rate conversion, sox denoising, silence detection and energy
        denoising. Recognition is attempted at most twice; the second attempt
        uses a larger energy-denoising scale. Afterwards the original file and
        every intermediate file are copied into the per-user backup directory
        and the intermediate files are deleted.

        Args:
            audio_file: Path of the audio file to recognize.
            audio_type: Audio source identifier; must be one of the values in
                ``supported_audio_types`` below.
            user: User name, forwarded to the denoise service and used as the
                name of the backup sub-directory.

        Returns:
            The value returned by ``self._callSphinxJava`` on success, or a
            single-element list holding an error message on failure.
        """
        # Imported locally so the method does not rely on a file-level import.
        import shutil

        # Check if path exists
        if not os.path.isfile(audio_file):
            return [
                "Error: Something went wrong with the local audio storage\
              Requested path: " + audio_file
            ]

        # Check that the audio_type is legit
        supported_audio_types = (
            "headset",
            "nao_ogg",
            "nao_wav_4_ch",
            "nao_wav_1_ch",
            "nao_wav_1_ch_denoised",
            "nao_wav_1_ch_only_sox",
            "nao_wav_1_ch_denoised_only_sox",
        )
        if audio_type not in supported_audio_types:
            return ["Error: Audio source unrecognized"]

        # Intermediate files produced below; backed up and erased at the end.
        audio_to_be_erased = []

        # prev_audio_file is the input of the next stage, next_audio_file the
        # output of the current one.
        next_audio_file = audio_file
        prev_audio_file = audio_file

        # Get processing profile
        profile = self._createProcessingProfile(audio_type)

        transform_req = AudioProcessingTransformAudioSrvRequest()
        transform_req.source_type = audio_type
        transform_req.source_name = prev_audio_file
        transform_req.target_type = 'wav'

        # Check if sox_transform is needed
        if profile['sox_transform']:
            next_audio_file += "_transformed.wav"
            transform_req.target_name = next_audio_file

            trans_response = self._audio_transform_srv(transform_req)
            if trans_response.error != 'success':
                # Report the error carried by the service response.
                return [
                    'Audio transformation error: ' + trans_response.error
                ]

            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['sox_channels_and_rate']:
            # NOTE(review): transform_req.source_name still points at the
            # original file here. If a profile ever enables both
            # transformation stages, confirm whether this one should read
            # the output of the previous stage instead.
            next_audio_file += "_mono16k.wav"
            transform_req.target_name = next_audio_file
            transform_req.target_channels = 1
            transform_req.target_rate = 16000

            trans_response = self._audio_transform_srv(transform_req)
            if trans_response.error != 'success':
                return [
                    'Audio transformation error: ' + trans_response.error
                ]

            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['sox_denoising']:
            next_audio_file = prev_audio_file + "_denoised.wav"
            den_request = AudioProcessingDenoiseSrvRequest()
            den_request.audio_file = prev_audio_file
            den_request.denoised_audio_file = next_audio_file
            den_request.audio_type = audio_type
            den_request.user = user
            den_request.scale = profile['sox_denoising_scale']
            den_response = self._denoise_service(den_request)
            if den_response.success != "true":
                return ["Error:" + den_response.success]
            audio_to_be_erased.append(next_audio_file)
            prev_audio_file = next_audio_file

        if profile['detect_silence']:
            # Detect silence
            silence_req = AudioProcessingDetectSilenceSrvRequest()
            silence_req.audio_file = prev_audio_file
            silence_req.threshold = profile['detect_silence_threshold']
            silence_res = self._detect_silence_service(silence_req)
            rapp_print("Silence detection results: " + str(silence_res))
            if silence_res.silence == "true":
                return [
                    "Error: No speech detected. RSD = " +
                    str(silence_res.level)
                ]

        tries = 0
        while tries < 2:
            # Perform energy denoising as well; the scale grows per retry.
            if profile['energy_denoising']:
                next_audio_file = prev_audio_file + "_energy_denoised.wav"
                dres = self._performEnergyDenoising(
                    next_audio_file, prev_audio_file,
                    profile['energy_denoising_init_scale'] + tries * 0.125)
                if dres != "true":
                    return ["Error:" + dres]
                audio_to_be_erased.append(next_audio_file)
                prev_audio_file = next_audio_file

            words = self._callSphinxJava(next_audio_file)
            if self._sphinxDied:
                # Reset the flag and give up retrying.
                self._sphinxDied = False
                break

            # Retry only when nothing was recognized.
            if len(words) == 0 or (len(words) == 1 and words[0] == ""):
                tries += 1
            else:
                break

        backup_directory = os.path.expanduser(
            "~/rapp_platform_files/rapp_speech_recognition_sphinx4/") + user
        if not os.path.isdir(backup_directory):
            os.makedirs(backup_directory)

        # Keep the original file and every intermediate file. Copying through
        # shutil avoids building shell commands from caller-supplied paths
        # and makes each failure detectable.
        for source in [audio_file] + audio_to_be_erased:
            target = os.path.join(backup_directory, os.path.basename(source))
            try:
                shutil.copy(source, target)
            except EnvironmentError:
                return ["Error: Server cp malfunctioned"]

        # Try to remove every intermediate file before reporting a failure.
        removal_failed = False
        for temp_file in audio_to_be_erased:
            try:
                os.remove(temp_file)
            except OSError:
                removal_failed = True
        if removal_failed:
            return ["Error: Server rm malfunctioned"]

        return words
  def performSpeechRecognition(self, audio_file, audio_source, user):
    """Pre-process an audio file and run Sphinx recognition on it.

    The processing steps are selected by the profile returned from
    ``self.createProcessingProfile(audio_source)``: format transformation,
    channel/rate conversion, sox denoising, silence detection and energy
    denoising. Recognition is attempted at most twice; the second attempt
    uses a larger energy-denoising scale. Afterwards the original file and
    every intermediate file are copied into the per-user backup directory
    and the intermediate files are deleted.

    Args:
      audio_file: Path of the audio file to recognize.
      audio_source: Audio source identifier; must be one of the values in
        ``supported_audio_sources`` below.
      user: User name, forwarded to the denoise service and used as the
        name of the backup sub-directory.

    Returns:
      The value returned by ``self.callSphinxJava`` on success, or a
      single-element list holding an error message on failure.

    Raises:
      RappError: If the audio transformation service reports an error.
    """
    # Imported locally so the method does not rely on a file-level import.
    import shutil

    # Check if path exists
    if not os.path.isfile(audio_file):
      return ["Error: Something went wrong with the local audio storage\
              Requested path: " + audio_file]

    # Check that the audio_source is legit
    supported_audio_sources = (
      "headset",
      "nao_ogg",
      "nao_wav_4_ch",
      "nao_wav_1_ch",
      "nao_wav_1_ch_denoised",
      "nao_wav_1_ch_only_sox",
      "nao_wav_1_ch_denoised_only_sox",
    )
    if audio_source not in supported_audio_sources:
      return ["Error: Audio source unrecognized"]

    # Intermediate files produced below; backed up and erased at the end.
    audio_to_be_erased = []

    # prev_audio_file is the input of the next stage, next_audio_file the
    # output of the current one.
    next_audio_file = audio_file
    prev_audio_file = audio_file

    # Get processing profile
    profile = self.createProcessingProfile(audio_source)

    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = audio_source
    transform_req.source_name = prev_audio_file
    transform_req.target_type = 'wav'

    # Check if sox_transform is needed
    if profile['sox_transform']:
      next_audio_file += "_transformed.wav"
      transform_req.target_name = next_audio_file

      trans_response = self.audio_transform_srv(transform_req)
      if trans_response.error != 'success':
        # Report the error carried by the service response.
        raise RappError(
          'Audio transformation error: ' + trans_response.error)

      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['sox_channels_and_rate']:
      # NOTE(review): transform_req.source_name still points at the
      # original file here. If a profile ever enables both transformation
      # stages, confirm whether this one should read the output of the
      # previous stage instead.
      next_audio_file += "_mono16k.wav"
      transform_req.target_name = next_audio_file
      transform_req.target_channels = 1
      transform_req.target_rate = 16000

      trans_response = self.audio_transform_srv(transform_req)
      if trans_response.error != 'success':
        raise RappError(
          'Audio transformation error: ' + trans_response.error)

      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['sox_denoising']:
      next_audio_file = prev_audio_file + "_denoised.wav"
      den_request = AudioProcessingDenoiseSrvRequest()
      den_request.audio_file = prev_audio_file
      den_request.denoised_audio_file = next_audio_file
      den_request.audio_type = audio_source
      den_request.user = user
      den_request.scale = profile['sox_denoising_scale']
      den_response = self.denoise_service(den_request)
      if den_response.success != "true":
        return ["Error:" + den_response.success]
      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['detect_silence']:
      # Detect silence
      silence_req = AudioProcessingDetectSilenceSrvRequest()
      silence_req.audio_file = prev_audio_file
      silence_req.threshold = profile['detect_silence_threshold']
      silence_res = self.detect_silence_service(silence_req)
      rapp_print("Silence detection results: " + str(silence_res))
      if silence_res.silence == "true":
        return ["Error: No speech detected. RSD = " + str(silence_res.level)]

    tries = 0
    while tries < 2:
      # Perform energy denoising as well; the scale grows per retry.
      if profile['energy_denoising']:
        next_audio_file = prev_audio_file + "_energy_denoised.wav"
        dres = self.performEnergyDenoising(
          next_audio_file, prev_audio_file,
          profile['energy_denoising_init_scale'] + tries * 0.125)
        if dres != "true":
          return ["Error:" + dres]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

      words = self.callSphinxJava(next_audio_file)
      if self.sphinxDied:
        # Reset the flag and give up retrying.
        self.sphinxDied = False
        break

      # Retry only when nothing was recognized.
      if len(words) == 0 or (len(words) == 1 and words[0] == ""):
        tries += 1
      else:
        break

    backup_directory = os.path.expanduser(
      "~/rapp_platform_files/rapp_speech_recognition_sphinx4/") + user
    if not os.path.isdir(backup_directory):
      os.makedirs(backup_directory)

    # Keep the original file and every intermediate file. Copying through
    # shutil avoids building shell commands from caller-supplied paths and
    # makes each failure detectable.
    for source in [audio_file] + audio_to_be_erased:
      target = os.path.join(backup_directory, os.path.basename(source))
      try:
        shutil.copy(source, target)
      except EnvironmentError:
        return ["Error: Server cp malfunctioned"]

    # Try to remove every intermediate file before reporting a failure.
    removal_failed = False
    for temp_file in audio_to_be_erased:
      try:
        os.remove(temp_file)
      except OSError:
        removal_failed = True
    if removal_failed:
      return ["Error: Server rm malfunctioned"]

    return words
  def speech_to_text(self, file_path, user, audio_file_type, language):
    """Transcribe an audio file with the Google speech recognition web API.

    The user is first looked up through the MySQL wrapper service. The audio
    is then converted to mono 16 kHz WAV, denoised when it comes from a NAO
    source, converted to FLAC and posted to Google. All temporary files
    created on the way are removed, also when a step fails.

    Args:
      file_path: Path of the audio file to transcribe.
      user: User name; checked against the database and forwarded to the
        denoise service.
      audio_file_type: Audio source identifier. 'nao_ogg', 'nao_wav_1_ch'
        and 'nao_wav_4_ch' trigger the denoising stage.
      language: Language code; 'en' is sent as 'en-US' and 'gr' as 'el',
        anything else is sent unchanged.

    Returns:
      The decoded JSON document returned by the web API.

    Raises:
      RappError: If the user is unknown, the file is missing, a conversion
        or denoising step fails, or (after a successful recognition) a
        temporary file cannot be removed.
    """
    # Imported locally so the method does not rely on a file-level import.
    import subprocess

    # Sample rate used for every conversion and announced to the web API.
    sample_rate = 16000

    def run_sox(arguments):
      # An argument list (no shell) keeps paths with spaces or shell
      # metacharacters from being interpreted.
      try:
        status = subprocess.call(["sox"] + arguments)
      except OSError:
        status = -1
      if status != 0:
        raise RappError("Error: sox malfunctioned")

    # Check the user
    serv_db_topic = rospy.get_param("rapp_mysql_wrapper_user_fetch_data_topic")
    authentication_service = rospy.ServiceProxy(serv_db_topic, fetchDataSrv)
    req_db = fetchDataSrv()
    req_db.req_cols = ["username"]
    req_db.where_data = [StringArrayMsg(s=["username", user])]

    resp = authentication_service(req_db.req_cols, req_db.where_data)
    if resp.success.data != True or len(resp.res_data) == 0:
      raise RappError("Non authenticated user")

    # Check if file exists
    if not os.path.isfile(file_path):
      raise RappError("Error: file " + file_path + ' not found')

    audio_trans_topic = \
        rospy.get_param("rapp_audio_processing_transform_audio_topic")
    audio_transform_srv = rospy.ServiceProxy(
        audio_trans_topic, AudioProcessingTransformAudioSrv)

    cleanup = []      # temporary files to delete when done
    conn = None
    succeeded = False
    try:
      # Convert the input to mono WAV at the working sample rate.
      new_audio = file_path + '.wav'
      transform_req = AudioProcessingTransformAudioSrvRequest()
      transform_req.source_type = audio_file_type
      transform_req.source_name = file_path
      transform_req.target_type = 'wav'
      transform_req.target_name = new_audio
      transform_req.target_channels = 1
      transform_req.target_rate = sample_rate

      trans_response = audio_transform_srv(transform_req)
      # Registered before the check so a partial output is removed too.
      cleanup.append(new_audio)
      if trans_response.error != 'success':
        raise RappError(trans_response.error)

      # Denoise if necessary. prev_audio_file always names the most recent
      # output of the pipeline.
      prev_audio_file = new_audio
      if audio_file_type in ['nao_ogg', 'nao_wav_1_ch', 'nao_wav_4_ch']:
        denoise_topic = rospy.get_param("rapp_audio_processing_denoise_topic")
        denoise_service = rospy.ServiceProxy(
            denoise_topic, AudioProcessingDenoiseSrv)

        # NOTE(review): the previous implementation configured silence
        # detection for 'nao_wav_1_ch' but never acted on it; it is still
        # not performed here.
        if audio_file_type == "nao_ogg":
          next_audio_file = prev_audio_file + "_transformed.wav"
          cleanup.append(next_audio_file)
          run_sox([prev_audio_file, next_audio_file])
          prev_audio_file = next_audio_file
        elif audio_file_type == "nao_wav_4_ch":
          next_audio_file = prev_audio_file + "_mono16k.wav"
          cleanup.append(next_audio_file)
          run_sox([prev_audio_file, "-r", "16000", "-c", "1",
                   next_audio_file])
          prev_audio_file = next_audio_file

        # Every NAO source is denoised with the same scale.
        next_audio_file = prev_audio_file + "_denoised.wav"
        den_request = AudioProcessingDenoiseSrvRequest()
        den_request.audio_file = prev_audio_file
        den_request.denoised_audio_file = next_audio_file
        den_request.audio_type = audio_file_type
        den_request.user = user
        den_request.scale = 0.15
        den_response = denoise_service(den_request)
        cleanup.append(next_audio_file)
        if den_response.success != "true":
          raise RappError("Error:" + den_response.success)
        prev_audio_file = next_audio_file

      # Transform to flac. The source is the last pipeline output so that
      # the denoised audio (when produced) is what gets recognized.
      newer_audio = new_audio + '.flac'
      transform_req = AudioProcessingTransformAudioSrvRequest()
      transform_req.source_type = 'headset'
      transform_req.source_name = prev_audio_file
      transform_req.target_type = 'flac'
      transform_req.target_name = newer_audio
      transform_req.target_channels = 1
      transform_req.target_rate = sample_rate

      trans_response = audio_transform_srv(transform_req)
      cleanup.append(newer_audio)
      if trans_response.error != 'success':
        raise RappError(trans_response.error)

      # FLAC is binary data, so read it in binary mode.
      with open(newer_audio, "rb") as f:
        speech = f.read()

      # Fix language
      if language == 'en':
        language = "en-US"
      elif language == 'gr':
        language = 'el'

      # NOTE: this is a general usage key; it may be disabled in the future.
      # NOTE(review): the key is a credential stored in source code;
      # consider loading it from configuration instead.
      key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
      path = "/speech-api/v2/recognize?lang=" + language + "&key=" + key
      # The announced rate has to describe the audio that is actually sent.
      headers = {"Content-type": "audio/x-flac; rate=" + str(sample_rate)}
      conn = httplib.HTTPSConnection("www.google.com")
      conn.request("POST", path, speech, headers)
      data = conn.getresponse().read()

      # Google returns one empty result first; drop everything up to and
      # including its closing brace.
      initial_data = data
      data = data[data.find("}") + 1:]
      if not data.strip():
        # Nothing followed the first result, so parse the full response.
        data = initial_data
      jsdata = json.loads(data)
      succeeded = True
    finally:
      if conn is not None:
        conn.close()
      # Remove every temporary file; files that were never created are
      # ignored, like `rm -f` does.
      leftovers = []
      for temp_file in cleanup:
        try:
          os.remove(temp_file)
        except OSError:
          if os.path.exists(temp_file):
            leftovers.append(temp_file)
      # Only report cleanup problems when no earlier error is propagating.
      if succeeded and leftovers:
        raise RappError("Error: Removal of temporary file malfunctioned")

    return jsdata
示例#4
0
  def performSpeechRecognition(self, audio_file, audio_type, user):
    """Pre-process an audio file and run Sphinx recognition on it.

    The processing steps are selected by the profile returned from
    ``self._createProcessingProfile(audio_type)``: format transformation,
    channel/rate conversion, sox denoising, silence detection and energy
    denoising. Recognition is attempted at most twice; the second attempt
    uses a larger energy-denoising scale. The intermediate files are deleted
    once recognition has finished.

    Args:
      audio_file: Path of the audio file to recognize.
      audio_type: Audio source identifier; must be one of the values in
        ``supported_audio_types`` below.
      user: User name, forwarded to the denoise service.

    Returns:
      The value returned by ``self._callSphinxJava`` on success, or a
      single-element list holding an error message on failure.
    """
    # Check if path exists
    if not os.path.isfile(audio_file):
      return ["Error: Something went wrong with the local audio storage\
              Requested path: " + audio_file]

    # Check that the audio_type is legit
    supported_audio_types = (
      "headset",
      "nao_ogg",
      "nao_wav_4_ch",
      "nao_wav_1_ch",
      "nao_wav_1_ch_denoised",
      "nao_wav_1_ch_only_sox",
      "nao_wav_1_ch_denoised_only_sox",
    )
    if audio_type not in supported_audio_types:
      return ["Error: Audio source unrecognized"]

    # Intermediate files produced below; erased at the end.
    audio_to_be_erased = []

    # prev_audio_file is the input of the next stage, next_audio_file the
    # output of the current one.
    next_audio_file = audio_file
    prev_audio_file = audio_file

    # Get processing profile
    profile = self._createProcessingProfile(audio_type)

    transform_req = AudioProcessingTransformAudioSrvRequest()
    transform_req.source_type = audio_type
    transform_req.source_name = prev_audio_file
    transform_req.target_type = 'wav'

    # Check if sox_transform is needed
    if profile['sox_transform']:
      next_audio_file += "_transformed.wav"
      transform_req.target_name = next_audio_file

      trans_response = self._audio_transform_srv(transform_req)
      if trans_response.error != 'success':
        return ['Audio transformation error: ' + trans_response.error]

      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['sox_channels_and_rate']:
      # NOTE(review): transform_req.source_name still points at the
      # original file here. If a profile ever enables both transformation
      # stages, confirm whether this one should read the output of the
      # previous stage instead.
      next_audio_file += "_mono16k.wav"
      transform_req.target_name = next_audio_file
      transform_req.target_channels = 1
      transform_req.target_rate = 16000

      trans_response = self._audio_transform_srv(transform_req)
      if trans_response.error != 'success':
        return ['Audio transformation error: ' + trans_response.error]

      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['sox_denoising']:
      next_audio_file = prev_audio_file + "_denoised.wav"
      den_request = AudioProcessingDenoiseSrvRequest()
      den_request.audio_file = prev_audio_file
      den_request.denoised_audio_file = next_audio_file
      den_request.audio_type = audio_type
      den_request.user = user
      den_request.scale = profile['sox_denoising_scale']
      den_response = self._denoise_service(den_request)
      if den_response.success != "true":
        return ["Error:" + den_response.success]
      audio_to_be_erased.append(next_audio_file)
      prev_audio_file = next_audio_file

    if profile['detect_silence']:
      # Detect silence
      silence_req = AudioProcessingDetectSilenceSrvRequest()
      silence_req.audio_file = prev_audio_file
      silence_req.threshold = profile['detect_silence_threshold']
      silence_res = self._detect_silence_service(silence_req)
      RappUtilities.rapp_print(
        "Silence detection results: " + str(silence_res))
      if silence_res.silence == "true":
        return ["Error: No speech detected. RSD = " + str(silence_res.level)]

    tries = 0
    while tries < 2:
      # Perform energy denoising as well; the scale grows per retry.
      if profile['energy_denoising']:
        next_audio_file = prev_audio_file + "_energy_denoised.wav"
        dres = self._performEnergyDenoising(
          next_audio_file, prev_audio_file,
          profile['energy_denoising_init_scale'] + tries * 0.125)
        if dres != "true":
          return ["Error:" + dres]
        audio_to_be_erased.append(next_audio_file)
        prev_audio_file = next_audio_file

      words = self._callSphinxJava(next_audio_file)
      if self._sphinxDied:
        # Reset the flag and give up retrying.
        self._sphinxDied = False
        break

      # Retry only when nothing was recognized.
      if len(words) == 0 or (len(words) == 1 and words[0] == ""):
        tries += 1
      else:
        break

    # Try to remove every intermediate file before reporting a failure, so
    # one undeletable file does not leave the others behind.
    removal_failed = False
    for temp_file in audio_to_be_erased:
      try:
        os.remove(temp_file)
      except OSError:
        removal_failed = True
    if removal_failed:
      return ["Error: Server rm malfunctioned"]

    return words