示例#1
0
def get_baidu_token(api_key, api_secret, token_url=constants.BAIDU_TOKEN_URL):
    """
    Request an access token for the Baidu ASR API.

    Posts the client credentials to ``token_url`` and inspects the JSON reply.

    Returns the ``access_token`` string when the reply carries both an
    ``access_token`` and a ``scope`` that lists ``audio_voice_assistant_get``.
    Returns an empty string when the reply cannot be parsed as JSON.

    Raises ``exceptions.SpeechToTextException`` when the reply lacks the token
    or the scope field (the pretty-printed reply is the message), or when the
    required scope is not among the granted ones.
    """
    form_body = urlencode({
        "grant_type": "client_credentials",
        "client_id": api_key,
        "client_secret": api_secret,
    }).encode("utf-8")
    response = requests.post(token_url, data=form_body)
    response_text = response.content.decode("utf-8")

    try:
        payload = json.loads(response_text)

        # A reply without both fields is reported back verbatim.
        if not ("access_token" in payload and "scope" in payload):
            raise exceptions.SpeechToTextException(
                json.dumps(payload, indent=4, ensure_ascii=False))

        # The scope field is a space-separated list of granted permissions.
        granted_scopes = payload["scope"].split(" ")
        if "audio_voice_assistant_get" not in granted_scopes:
            raise exceptions.SpeechToTextException(
                _("Error: Check you project if its ASR feature is enabled."))

        return payload["access_token"]
    except (ValueError, IndexError):
        # The reply was not usable JSON: report "no token".
        return ""
示例#2
0
    def __call__(self, filename):
        """
        Transcribe one audio file by posting it to ``self.api_url``.

        The file is read into memory and deleted unless ``self.is_keep`` is
        set. The request is then sent up to ``self.retries`` times; an attempt
        whose response body is not valid JSON is retried.

        Args:
            filename: Path of the audio file to transcribe.

        Returns:
            The first alternative's transcript with its first character
            upper-cased and typographic apostrophes replaced by ``'``.
            ``None`` when the reported confidence is not greater than
            ``self.min_confidence``, when no attempt produced parsable JSON,
            or when a ``KeyboardInterrupt`` occurs.

        Raises:
            exceptions.SpeechToTextException: The response is valid JSON but
                holds no transcript; the raw response text is the message.
        """
        try:
            # ``with`` guarantees the handle is closed even if ``read`` fails.
            with open(filename, mode='rb') as audio_file:
                audio_data = audio_file.read()
            if not self.is_keep:
                os.remove(filename)

            # https://cloud.google.com/speech-to-text/docs/quickstart-protocol
            # https://cloud.google.com/speech-to-text/docs/base64-encoding
            # https://gist.github.com/bretmcg/07e0efe27611d7039c2e4051b4354908
            # The request body is the same for every attempt: build it once.
            audio_dict = \
                {"content": base64.b64encode(audio_data).decode("utf-8")}
            request_data = {"config": self.config, "audio": audio_dict}
            config_json = json.dumps(request_data, ensure_ascii=False)

            for _attempt in range(self.retries):
                requests_result = requests.post(
                    self.api_url, data=config_json, headers=self.headers)
                requests_result_json = requests_result.content.decode('utf-8')

                try:
                    result_dict = json.loads(requests_result_json)
                except JSONDecodeError:
                    # no result, try again
                    continue

                if not ('results' in result_dict and result_dict['results']
                        and 'alternatives' in result_dict['results'][0]
                        and result_dict['results'][0]['alternatives']
                        and 'transcript'
                        in result_dict['results'][0]['alternatives'][0]):
                    raise exceptions.SpeechToTextException(
                        requests_result_json)

                alternative = result_dict['results'][0]['alternatives'][0]

                # A missing confidence field is treated as 100% confident.
                if 'confidence' in alternative \
                        and not float(alternative['confidence']) \
                        > self.min_confidence:
                    return None

                transcript = alternative['transcript']
                result = transcript[:1].upper() + transcript[1:]
                return result.replace('’', '\'')

        except KeyboardInterrupt:
            return None

        return None
示例#3
0
def gcsv1p1beta1_service_client(filename, is_keep, config, min_confidence):
    """
    Function for performing speech-to-text
    using Google Cloud Speech V1P1Beta1 API for an input audio file.

    The file is read into memory and deleted unless ``is_keep`` is set, then
    sent through a ``speech_v1p1beta1.SpeechClient``.

    Args:
        filename: Path of the audio file to transcribe.
        is_keep: When false, ``filename`` is removed after it has been read.
        config: Recognition config passed unchanged to ``client.recognize``.
        min_confidence: A result is accepted only when its confidence is
            greater than this value.

    Returns:
        The first alternative's transcript with its first character
        upper-cased and typographic apostrophes replaced by ``'``.
        ``None`` when the confidence is not greater than ``min_confidence``
        or when a ``KeyboardInterrupt`` occurs.

    Raises:
        exceptions.SpeechToTextException: The response holds no transcript;
            the pretty-printed response dictionary is the message.
    """
    try:
        # ``with`` guarantees the handle is closed even if ``read`` fails.
        with open(filename, mode='rb') as audio_file:
            audio_data = audio_file.read()
        if not is_keep:
            os.remove(filename)

        # https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries
        # https://cloud.google.com/speech-to-text/docs/basics
        # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1
        client = speech_v1p1beta1.SpeechClient()
        audio_dict = {"content": audio_data}
        recognize_response = client.recognize(config, audio_dict)
        result_dict = MessageToDict(recognize_response,
                                    preserving_proto_field_name=True)

        if not ('results' in result_dict and result_dict['results']
                and 'alternatives' in result_dict['results'][0]
                and result_dict['results'][0]['alternatives']
                and 'transcript'
                in result_dict['results'][0]['alternatives'][0]):
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))

        alternative = result_dict['results'][0]['alternatives'][0]

        # A missing confidence field is treated as 100% confident.
        if 'confidence' in alternative \
                and not float(alternative['confidence']) > min_confidence:
            return None

        transcript = alternative['transcript']
        result = transcript[:1].upper() + transcript[1:]
        return result.replace('’', '\'')

    except KeyboardInterrupt:
        return None
示例#4
0
def get_xfyun_transcript(result_dict):
    """
    Extract the transcript text from one Xun Fei Yun result dictionary.

    Concatenates the first candidate word (``cw[0]["w"]``) of every entry in
    ``data.result.ws``.

    Returns the joined text, or an empty string when an expected key is
    missing or a value has the wrong type.

    Raises ``exceptions.SpeechToTextException`` with the pretty-printed
    dictionary when ``code`` is not 0.
    """
    try:
        if result_dict["code"] != 0:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        return "".join(
            segment["cw"][0]["w"]
            for segment in result_dict["data"]["result"]["ws"])
    except (KeyError, TypeError):
        return ""
示例#5
0
def get_xfyun_transcript(result_dict):
    """
    Function for getting transcript from Xun Fei Yun Speech-to-Text Websocket API result dictionary.
    Reference: https://www.xfyun.cn/doc/asr/voicedictation/API.html

    Concatenates the first candidate word (``cw[0]["w"]``) of every entry in
    ``data.result.ws``.

    Args:
        result_dict: One decoded JSON message received from the API.

    Returns:
        The joined transcript, or an empty string when the dictionary does
        not have the expected shape (missing key, wrong type, or an empty
        candidate list).

    Raises:
        exceptions.SpeechToTextException: ``code`` is not 0; the
            pretty-printed dictionary is the message.
    """
    try:
        if result_dict["code"] != 0:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        words = [item["cw"][0]["w"]
                 for item in result_dict["data"]["result"]["ws"]]
        return "".join(words)
    except (KeyError, TypeError, IndexError):
        # IndexError: an entry with an empty "cw" list is as malformed as a
        # missing key, so it gets the same "no transcript" answer.
        return ""
示例#6
0
def get_baidu_transcript(result_dict, delete_chars=None):
    """
    Function for getting transcript from Baidu ASR API result dictionary.
    Reference: https://ai.baidu.com/ai-doc/SPEECH/ek38lxj1u

    Args:
        result_dict: Decoded JSON response from the API.
        delete_chars: Optional string of characters; each occurrence in the
            transcript is replaced by a space and trailing spaces are then
            stripped.

    Returns:
        The first entry of ``result_dict["result"]`` (optionally cleaned), or
        an empty string when the dictionary does not have the expected shape
        (missing key, wrong type, or an empty result list).

    Raises:
        exceptions.SpeechToTextException: ``err_no`` is not 0; the
            pretty-printed dictionary is the message.
    """
    try:
        if result_dict["err_no"] != 0:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        transcript = result_dict["result"][0]
        if delete_chars:
            # Map every unwanted character to a space in a single pass.
            blank_table = str.maketrans(delete_chars, " " * len(delete_chars))
            return transcript.translate(blank_table).rstrip(" ")
        return transcript
    except (KeyError, TypeError, IndexError):
        # IndexError: a successful reply with an empty "result" list simply
        # carries no transcript.
        return ""
示例#7
0
def get_gcsv1p1beta1_transcript(
        min_confidence,
        result_dict):
    """
    Pick the transcript out of a Google Cloud Speech-to-Text V1P1Beta1 result.

    Only the first alternative of the first result is considered. The returned
    text has its first character upper-cased and typographic apostrophes
    replaced by ``'``.

    Returns ``None`` for an empty ``result_dict`` or when the alternative's
    confidence is not greater than ``min_confidence``. An alternative without
    a confidence field is accepted as is.

    Raises ``exceptions.SpeechToTextException`` with the pretty-printed
    dictionary when it is non-empty but holds no transcript.
    """
    has_transcript = bool(
        'results' in result_dict and result_dict['results']
        and 'alternatives' in result_dict['results'][0]
        and result_dict['results'][0]['alternatives']
        and 'transcript' in result_dict['results'][0]['alternatives'][0])

    if not has_transcript:
        if not result_dict:
            # if api returned empty json, don't throw the exception
            return None
        raise exceptions.SpeechToTextException(
            json.dumps(result_dict, indent=4, ensure_ascii=False))

    best_alternative = result_dict['results'][0]['alternatives'][0]

    # No confidence in the json means it's 100% confident.
    if 'confidence' in best_alternative \
            and not float(best_alternative['confidence']) > min_confidence:
        return None

    text = best_alternative['transcript']
    capitalized = text[:1].upper() + text[1:]
    return capitalized.replace('’', '\'')
示例#8
0
async def xfyun_speech_websocket(url,
                                 app_id,
                                 filename,
                                 business_args,
                                 is_full_result=False):
    """
    Stream an audio file to a Xun Fei Yun speech WebSocket and collect results.

    The file is sent in 1280-byte frames, base64-encoded. The first frame
    carries the ``common`` and ``business`` sections, later frames only the
    ``data`` section. After every frame one message is received and either
    reduced to text with ``get_xfyun_transcript`` or stored whole.

    Args:
        url: WebSocket URL to connect to.
        app_id: Application id sent as ``common.app_id`` in the first frame.
        filename: Path of the audio file to stream.
        business_args: Dictionary sent as ``business`` in the first frame.
        is_full_result: When true, return the list of decoded messages
            instead of the concatenated transcript.

    Returns:
        The transcript string, or the list of decoded messages when
        ``is_full_result`` is true. Results gathered before a
        ``SpeechToTextException`` (closed connection, API error code) are
        still returned.

    Raises:
        exceptions.SpeechToTextException: The WebSocket handshake was
            rejected with an invalid status code.
    """
    # Local import keeps this block self-contained; the coroutine must not
    # block the event loop with time.sleep.
    import asyncio

    data = {
        "status": 0,
        "format": "audio/L16;rate=16000",
        "encoding": "raw",
        "audio": ""
    }
    common_args = {"app_id": app_id}

    transcript = ""
    result_list = []

    try:
        async with websockets.connect(url) as web_socket:
            frame_size = 1280  # size of every audio frame in bytes
            interval = 0.04  # delay between two frames (unit: s)
            # frame state: 0 = first frame, 1 = middle frame, 2 = last frame
            status = 0
            with open(filename, "rb") as fp:
                while True:
                    buf = fp.read(frame_size)
                    # end of file
                    if not buf:
                        status = 2
                    data["audio"] = str(base64.b64encode(buf), "utf-8")
                    # First frame: it carries the business arguments and the
                    # app id, which only have to be sent once.
                    if status == 0:
                        data["status"] = 0
                        web_socket_data = {
                            "common": common_args,
                            "business": business_args,
                            "data": data
                        }
                        status = 1
                    # middle frame
                    elif status == 1:
                        data["status"] = 1
                        web_socket_data = {"data": data}
                    # last frame
                    elif status == 2:
                        data["status"] = 2
                        web_socket_data = {"data": data}
                        web_socket_json = json.dumps(web_socket_data)
                        await web_socket.send(web_socket_json)
                        try:
                            result = await web_socket.recv()
                            print(result)
                            web_socket_result = json.loads(result)
                            print(web_socket_result)
                        except (websockets.exceptions.ConnectionClosedOK,
                                ValueError):
                            raise exceptions.SpeechToTextException("")

                        if not is_full_result:
                            transcript = transcript + get_xfyun_transcript(
                                web_socket_result)
                        else:
                            result_list.append(web_socket_result)
                        await asyncio.sleep(1)
                        break

                    web_socket_json = json.dumps(web_socket_data)
                    await web_socket.send(web_socket_json)
                    try:
                        result = await web_socket.recv()
                        print(result)
                        web_socket_result = json.loads(result)
                    except websockets.exceptions.ConnectionClosedOK:
                        raise exceptions.SpeechToTextException("")
                    except ValueError:
                        # undecodable message: move on to the next frame
                        continue

                    if not is_full_result:
                        transcript = transcript + get_xfyun_transcript(
                            web_socket_result)
                    else:
                        result_list.append(web_socket_result)
                    # simulate the audio sampling interval
                    await asyncio.sleep(interval)

    except websockets.exceptions.InvalidStatusCode as error:
        # Report the caught error itself rather than the exception class.
        raise exceptions.SpeechToTextException(error) from error

    except exceptions.SpeechToTextException:
        # Deliberate best effort: whatever was collected so far is returned.
        pass

    # Reached on normal completion as well, so a fully streamed file yields
    # its results instead of None.
    if not is_full_result:
        return transcript
    return result_list
示例#9
0
def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statements, too-many-locals, too-many-arguments
        args,
        input_m=input,
        fps=30.0,
        styles_list=None,
        no_audio_prcs=False):
    """
    Give args and process an input audio or video file.

    Steps, in order:
      1. Get speech regions, either from the external subtitles file
         ``args.ext_regions`` or by running auditok on a temporary WAV
         conversion of ``args.input``.
      2. If neither a speech language nor the ``'s'`` audio process is
         requested, write the regions ("times") file and stop.
      3. Otherwise optionally write the times file, convert the input for
         the API, split it into one fragment per region and run
         speech-to-text.
      4. With ``args.dst_language`` set, translate and write the requested
         "src", "bilingual" and "dst" files; without it, write the speech
         language subtitles file only.

    ``args.output_files`` is consumed as the outputs are written.

    Args:
        args: Parsed options object; only its attributes are read, apart
            from ``output_files`` which is modified in place.
        input_m: Input function forwarded to ``core.str_to_file``.
        fps: Frame rate forwarded to ``core.list_to_sub_str``.
        styles_list: Styles forwarded to ``core.list_to_ass_str`` when
            ``args.styles`` is set and the format is ass/ssa/ass.json.
        no_audio_prcs: When true, ``args.input`` is used directly as the
            audio for the API instead of a converted temporary file.

    Raises:
        exceptions.AutosubException: On failures, and also to signal that
            all requested work is finished ("All works done.").
        exceptions.ConversionException: The number of audio fragments does
            not match the number of regions.
        exceptions.SpeechToTextException: The number of transcripts does not
            match the number of regions.
    """

    if args.http_speech_api:
        gsv2_api_url = "http://" + constants.GOOGLE_SPEECH_V2_API_URL
    else:
        gsv2_api_url = "https://" + constants.GOOGLE_SPEECH_V2_API_URL

    if not args.output_files:
        raise exceptions.AutosubException(
            _("\nNo works done."
              " Check your \"-of\"/\"--output-files\" option."))

    if args.ext_regions:
        # use external speech regions
        print(_("Use external speech regions."))
        audio_wav_temp = tempfile.NamedTemporaryFile(suffix='.wav',
                                                     delete=False)
        audio_wav = audio_wav_temp.name
        audio_wav_temp.close()
        command = args.audio_conversion_cmd.format(in_=args.input,
                                                   channel=1,
                                                   sample_rate=16000,
                                                   out_=audio_wav)
        print(command)
        # Close the null-device handle once the conversion has finished.
        with open(os.devnull) as devnull:
            subprocess.check_output(constants.cmd_conversion(command),
                                    stdin=devnull)
        regions = sub_utils.sub_to_speech_regions(audio_wav=audio_wav,
                                                  sub_file=args.ext_regions)
        os.remove(audio_wav)

    else:
        # use auditok_gen_speech_regions
        mode = 0
        if args.strict_min_length:
            mode = auditok.StreamTokenizer.STRICT_MIN_LENGTH
            if args.drop_trailing_silence:
                mode = mode | auditok.StreamTokenizer.DROP_TRAILING_SILENCE
        elif args.drop_trailing_silence:
            mode = auditok.StreamTokenizer.DROP_TRAILING_SILENCE

        audio_wav_temp = tempfile.NamedTemporaryFile(suffix='.wav',
                                                     delete=False)
        audio_wav = audio_wav_temp.name
        audio_wav_temp.close()
        command = args.audio_conversion_cmd.format(in_=args.input,
                                                   channel=1,
                                                   sample_rate=48000,
                                                   out_=audio_wav)
        print(
            _("\nConvert source audio to \"{name}\" "
              "and get audio length for regions detection.").format(
                  name=audio_wav))
        print(command)
        # Close the null-device handle once the conversion has finished.
        with open(os.devnull) as devnull:
            subprocess.check_output(constants.cmd_conversion(command),
                                    stdin=devnull)

        if not ffmpeg_utils.ffprobe_check_file(audio_wav):
            raise exceptions.AutosubException(
                _("Error: Convert source audio to \"{name}\" failed.").format(
                    name=audio_wav))

        regions = core.auditok_gen_speech_regions(
            audio_wav=audio_wav,
            energy_threshold=args.energy_threshold,
            min_region_size=constants.MIN_REGION_SIZE,
            max_region_size=constants.MAX_REGION_SIZE,
            max_continuous_silence=constants.DEFAULT_CONTINUOUS_SILENCE,
            mode=mode)
        os.remove(audio_wav)
        print(_("\n\"{name}\" has been deleted.").format(name=audio_wav))

    if not regions:
        raise exceptions.AutosubException(
            _("Error: Can't get speech regions."))
    if args.speech_language or \
            args.audio_process and 's' in args.audio_process:
        # process output first
        try:
            # remove() raises KeyError when "regions" was not requested,
            # which skips the whole times-file step.
            args.output_files.remove("regions")
            if args.styles and \
                    (args.format == 'ass' or
                     args.format == 'ssa' or
                     args.format == 'ass.json'):
                times_string = core.list_to_ass_str(
                    text_list=regions,
                    styles_list=styles_list,
                    subtitles_file_format=args.format)
            else:
                times_string = core.list_to_sub_str(
                    timed_text=regions,
                    fps=fps,
                    subtitles_file_format=args.format)
            # times to subtitles string
            times_name = "{base}.{nt}.{extension}".format(
                base=args.output, nt="times", extension=args.format)
            subtitles_file_path = core.str_to_file(str_=times_string,
                                                   output=times_name,
                                                   input_m=input_m)
            # subtitles string to file

            print(
                _("Times file created at \"{}\".").format(subtitles_file_path))

            if not args.output_files:
                raise exceptions.AutosubException(_("\nAll works done."))

        except KeyError:
            pass

        if not no_audio_prcs:
            audio_for_api_temp = tempfile.NamedTemporaryFile(
                suffix=args.api_suffix, delete=False)
            audio_for_api = audio_for_api_temp.name
            audio_for_api_temp.close()
            command = args.audio_conversion_cmd.format(
                in_=args.input,
                channel=args.api_audio_channel,
                sample_rate=args.api_sample_rate,
                out_=audio_for_api)
            print(
                _("\nConvert to \"{name}\" "
                  "for API.").format(name=audio_for_api))
            print(command)
            # Close the null-device handle once the conversion has finished.
            with open(os.devnull) as devnull:
                subprocess.check_output(constants.cmd_conversion(command),
                                        stdin=devnull)
            if not ffmpeg_utils.ffprobe_check_file(audio_for_api):
                raise exceptions.AutosubException(
                    _("Error: Convert source audio to \"{name}\" failed.").
                    format(name=audio_for_api))

        else:
            audio_for_api = args.input

        audio_fragments = core.bulk_audio_conversion(
            source_file=audio_for_api,
            output=args.output,
            regions=regions,
            split_cmd=args.audio_split_cmd,
            suffix=args.api_suffix,
            concurrency=args.audio_concurrency,
            is_keep=args.keep)

        if not audio_fragments or \
                len(audio_fragments) != len(regions):
            if not args.keep:
                # "or []": a None result must not hide the conversion error
                # behind a TypeError.
                for audio_fragment in audio_fragments or []:
                    os.remove(audio_fragment)
            raise exceptions.ConversionException(
                _("Error: Conversion failed."))

        if not args.keep:
            # NOTE(review): with no_audio_prcs set, audio_for_api is
            # args.input, so this removes the input file itself - confirm
            # that callers expect this.
            os.remove(audio_for_api)
            print(
                _("\n\"{name}\" has been deleted.").format(name=audio_for_api))

        if args.audio_process and 's' in args.audio_process:
            raise exceptions.AutosubException(
                _("Audio processing complete.\nAll works done."))

        # speech to text
        text_list = core.audio_to_text(audio_fragments=audio_fragments,
                                       api_url=gsv2_api_url,
                                       regions=regions,
                                       api_key=args.gspeechv2,
                                       concurrency=args.speech_concurrency,
                                       src_language=args.speech_language,
                                       min_confidence=args.min_confidence,
                                       audio_rate=args.api_sample_rate,
                                       is_keep=args.keep)

        if not text_list or len(text_list) != len(regions):
            raise exceptions.SpeechToTextException(
                _("Error: Speech-to-text failed.\nAll works done."))

        timed_text = get_timed_text(is_empty_dropped=args.drop_empty_regions,
                                    regions=regions,
                                    text_list=text_list)

        if args.dst_language:
            # process output first
            try:
                args.output_files.remove("src")
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    src_string = core.list_to_ass_str(
                        text_list=timed_text,
                        styles_list=styles_list[:2],
                        subtitles_file_format=args.format,
                    )
                else:
                    src_string = core.list_to_sub_str(
                        timed_text=timed_text,
                        fps=fps,
                        subtitles_file_format=args.format)

                # formatting timed_text to subtitles string
                src_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.speech_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=src_string,
                                                       output=src_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Speech language subtitles "
                      "file created at \"{}\".").format(subtitles_file_path))

                if not args.output_files:
                    raise exceptions.AutosubException(_("\nAll works done."))

            except KeyError:
                pass

            # text translation
            if args.gtransv2:
                # use gtransv2
                translated_text = core.list_to_gtv2(
                    text_list=text_list,
                    api_key=args.gtransv2,
                    concurrency=args.trans_concurrency,
                    src_language=args.src_language,
                    dst_language=args.dst_language,
                    lines_per_trans=args.lines_per_trans)
            else:
                # use googletrans
                translated_text = core.list_to_googletrans(
                    text_list,
                    src_language=args.src_language,
                    dst_language=args.dst_language,
                    sleep_seconds=args.sleep_seconds,
                    user_agent=args.user_agent,
                    service_urls=args.service_urls)

            if not translated_text or len(translated_text) != len(regions):
                raise exceptions.AutosubException(
                    _("Error: Translation failed."))

            timed_trans = get_timed_text(
                is_empty_dropped=args.drop_empty_regions,
                regions=regions,
                text_list=translated_text)

            try:
                args.output_files.remove("bilingual")
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    bilingual_string = core.list_to_ass_str(
                        text_list=[timed_text, timed_trans],
                        styles_list=styles_list,
                        subtitles_file_format=args.format,
                    )
                else:
                    bilingual_string = core.list_to_sub_str(
                        timed_text=timed_text + timed_trans,
                        fps=fps,
                        subtitles_file_format=args.format)
                # formatting timed_text to subtitles string
                bilingual_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.src_language + '&' + args.dst_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=bilingual_string,
                                                       output=bilingual_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Bilingual subtitles file "
                      "created at \"{}\".").format(subtitles_file_path))

                if not args.output_files:
                    raise exceptions.AutosubException(_("\nAll works done."))

            except KeyError:
                pass

            try:
                args.output_files.remove("dst")
                # formatting timed_text to subtitles string
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    if len(args.styles) == 4:
                        dst_string = core.list_to_ass_str(
                            text_list=timed_trans,
                            styles_list=styles_list[2:4],
                            subtitles_file_format=args.format,
                        )
                    else:
                        dst_string = core.list_to_ass_str(
                            text_list=timed_trans,
                            styles_list=styles_list,
                            subtitles_file_format=args.format,
                        )
                else:
                    dst_string = core.list_to_sub_str(
                        timed_text=timed_trans,
                        fps=fps,
                        subtitles_file_format=args.format)
                dst_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.dst_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=dst_string,
                                                       output=dst_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Destination language subtitles "
                      "file created at \"{}\".").format(subtitles_file_path))

            except KeyError:
                pass

        else:
            if len(args.output_files) > 1 or not ({"dst", "src"}
                                                  & args.output_files):
                print(
                    _("Override \"-of\"/\"--output-files\" due to your args too few."
                      "\nOutput source subtitles file only."))
            timed_text = get_timed_text(
                is_empty_dropped=args.drop_empty_regions,
                regions=regions,
                text_list=text_list)
            if args.styles and \
                    (args.format == 'ass' or
                     args.format == 'ssa' or
                     args.format == 'ass.json'):
                src_string = core.list_to_ass_str(
                    text_list=timed_text,
                    styles_list=styles_list,
                    subtitles_file_format=args.format,
                )
            else:
                src_string = core.list_to_sub_str(
                    timed_text=timed_text,
                    fps=fps,
                    subtitles_file_format=args.format)
            # formatting timed_text to subtitles string
            src_name = "{base}.{nt}.{extension}".format(
                base=args.output,
                nt=args.speech_language,
                extension=args.format)
            subtitles_file_path = core.str_to_file(str_=src_string,
                                                   output=src_name,
                                                   input_m=input_m)
            # subtitles string to file
            print(
                _("Speech language subtitles "
                  "file created at \"{}\".").format(subtitles_file_path))

    else:
        print(
            _("Override \"-of\"/\"--output-files\" due to your args too few."
              "\nOutput regions subtitles file only."))
        if args.styles and \
                (args.format == 'ass' or
                 args.format == 'ssa' or
                 args.format == 'ass.json'):
            times_subtitles = core.list_to_ass_str(
                text_list=regions,
                styles_list=styles_list,
                subtitles_file_format=args.format)
        else:
            times_subtitles = core.list_to_sub_str(
                timed_text=regions, fps=fps, subtitles_file_format=args.format)
        # times to subtitles string
        times_name = "{base}.{nt}.{extension}".format(base=args.output,
                                                      nt="times",
                                                      extension=args.format)
        subtitles_file_path = core.str_to_file(str_=times_subtitles,
                                               output=times_name,
                                               input_m=input_m)
        # subtitles string to file

        print(_("Times file created at \"{}\".").format(subtitles_file_path))
示例#10
0
 def on_error(self, web_socket, error):  # pylint: disable=no-self-use
     """
     Turn an error reported by the WebSocket into a SpeechToTextException.

     ``web_socket`` is accepted but not used; ``error`` becomes the
     exception's argument.
     """
     failure = exceptions.SpeechToTextException(error)
     raise failure