def get_baidu_token(api_key, api_secret, token_url=constants.BAIDU_TOKEN_URL):
    """
    Function for getting Baidu ASR API token.

    Arguments:
        api_key: Baidu application API key.
        api_secret: Baidu application secret key.
        token_url: OAuth token endpoint (defaults to constants.BAIDU_TOKEN_URL).

    Returns:
        The access token string on success, or "" when the response body
        is not valid JSON.

    Raises:
        exceptions.SpeechToTextException: when the response is valid JSON
            but contains no usable token, or the ASR scope is not granted.
    """
    requests_params = {
        "grant_type": "client_credentials",
        "client_id": api_key,
        "client_secret": api_secret
    }
    post_data = urlencode(requests_params).encode("utf-8")
    # timeout so a stalled token endpoint cannot hang the whole run
    result = requests.post(token_url, data=post_data, timeout=30)
    result_str = result.content.decode("utf-8")
    try:
        result_dict = json.loads(result_str)
    except ValueError:
        # response body is not JSON -- no token available
        return ""
    if "access_token" in result_dict and "scope" in result_dict:
        # the voice-assistant ASR scope must be granted to the project
        if "audio_voice_assistant_get" not in result_dict["scope"].split(" "):
            # NOTE: message text is a gettext key -- left byte-identical
            raise exceptions.SpeechToTextException(
                _("Error: Check you project if its ASR feature is enabled."))
        return result_dict["access_token"]
    # valid JSON but no token: surface the raw response for diagnosis
    raise exceptions.SpeechToTextException(
        json.dumps(result_dict, indent=4, ensure_ascii=False))
def __call__(self, filename):
    """
    Send the audio file to the Google Speech-to-Text REST API and
    return the formatted transcript.

    Arguments:
        filename: path of the audio fragment to recognize; the file is
            deleted after reading unless self.is_keep is set.

    Returns:
        The transcript string (first letter capitalized, right single
        quote normalized) when confidence exceeds self.min_confidence,
        None on low confidence / interrupt / exhausted retries.

    Raises:
        exceptions.SpeechToTextException: when the API returns valid
            JSON without the expected results structure.
    """
    try:  # pylint: disable=too-many-nested-blocks
        # context manager guarantees the handle is closed even on error
        with open(filename, mode='rb') as audio_file:
            audio_data = audio_file.read()
        if not self.is_keep:
            os.remove(filename)

        def _format_transcript(transcript):
            # capitalize first character and normalize the right
            # single quotation mark to an apostrophe
            result = transcript[:1].upper() + transcript[1:]
            return result.replace('’', '\'')

        for _ in range(self.retries):
            # https://cloud.google.com/speech-to-text/docs/quickstart-protocol
            # https://cloud.google.com/speech-to-text/docs/base64-encoding
            # https://gist.github.com/bretmcg/07e0efe27611d7039c2e4051b4354908
            audio_dict = \
                {"content": base64.b64encode(audio_data).decode("utf-8")}
            request_data = {"config": self.config, "audio": audio_dict}
            config_json = json.dumps(request_data, ensure_ascii=False)
            requests_result = \
                requests.post(self.api_url,
                              data=config_json,
                              headers=self.headers)
            requests_result_json = requests_result.content.decode('utf-8')

            try:
                result_dict = json.loads(requests_result_json)
            except JSONDecodeError:
                # no valid result this attempt -- retry
                continue

            if 'results' in result_dict and result_dict['results'] \
                    and 'alternatives' in result_dict['results'][0] \
                    and result_dict['results'][0]['alternatives'] \
                    and 'transcript' in \
                    result_dict['results'][0]['alternatives'][0]:
                best_alternative = result_dict['results'][0]['alternatives'][0]
            else:
                raise exceptions.SpeechToTextException(requests_result_json)

            if 'confidence' in best_alternative:
                confidence = float(best_alternative['confidence'])
                if confidence > self.min_confidence:
                    return _format_transcript(best_alternative['transcript'])
                return None
            # can't find confidence in json
            # means it's 100% confident
            return _format_transcript(best_alternative['transcript'])

    except KeyboardInterrupt:
        return None

    return None
def gcsv1p1beta1_service_client(filename, is_keep, config, min_confidence):
    """
    Function for performing speech-to-text
    using Google Cloud Speech V1P1Beta1 API for an input FLAC file.

    Arguments:
        filename: path of the FLAC fragment; deleted after reading unless
            is_keep is set.
        is_keep: when True, keep the input file on disk.
        config: recognition config passed to SpeechClient.recognize.
        min_confidence: transcripts at or below this confidence return None.

    Returns:
        The formatted transcript string, or None on low confidence /
        keyboard interrupt.

    Raises:
        exceptions.SpeechToTextException: when the response lacks the
            expected results structure.
    """
    try:  # pylint: disable=too-many-nested-blocks
        # context manager guarantees the handle is closed even on error
        with open(filename, mode='rb') as audio_file:
            audio_data = audio_file.read()
        if not is_keep:
            os.remove(filename)

        # https://cloud.google.com/speech-to-text/docs/quickstart-client-libraries
        # https://cloud.google.com/speech-to-text/docs/basics
        # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1
        client = speech_v1p1beta1.SpeechClient()
        audio_dict = {"content": audio_data}
        recognize_response = client.recognize(config, audio_dict)
        result_dict = MessageToDict(recognize_response,
                                    preserving_proto_field_name=True)

        if 'results' in result_dict and result_dict['results'] \
                and 'alternatives' in result_dict['results'][0] \
                and result_dict['results'][0]['alternatives'] \
                and 'transcript' in \
                result_dict['results'][0]['alternatives'][0]:
            best_alternative = result_dict['results'][0]['alternatives'][0]
        else:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))

        if 'confidence' in best_alternative and \
                float(best_alternative['confidence']) <= min_confidence:
            return None
        # can't find confidence in json means it's 100% confident;
        # capitalize first character and normalize the right single quote
        transcript = best_alternative['transcript']
        result = transcript[:1].upper() + transcript[1:]
        return result.replace('’', '\'')

    except KeyboardInterrupt:
        return None
def get_xfyun_transcript(result_dict):
    """
    Function for getting transcript from Xun Fei Yun Speech-to-Text
    Websocket API result dictionary.

    Returns "" when the dictionary does not have the expected shape.

    Raises:
        exceptions.SpeechToTextException: when the API reports a
            non-zero error code.
    """
    try:
        if result_dict["code"] != 0:
            # non-zero code marks an API-side error; dump the payload
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        # join the word ("w") of the first candidate ("cw") of every
        # word segment ("ws") -- linear-time instead of += in a loop
        return "".join(
            item["cw"][0]["w"]
            for item in result_dict["data"]["result"]["ws"])
    except (KeyError, TypeError):
        return ""
def get_xfyun_transcript(result_dict):
    """
    Extract the transcript text from a Xun Fei Yun Speech-to-Text
    Websocket API result dictionary.

    Reference: https://www.xfyun.cn/doc/asr/voicedictation/API.html
    """
    try:
        # a non-zero code means the API reported an error
        if result_dict["code"] != 0:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        word_segments = result_dict["data"]["result"]["ws"]
        pieces = [segment["cw"][0]["w"] for segment in word_segments]
        return "".join(pieces)
    except (KeyError, TypeError):
        # malformed / unexpected payload shape
        return ""
def get_baidu_transcript(result_dict, delete_chars=None):
    """
    Extract the transcript from a Baidu ASR API result dictionary.

    Reference: https://ai.baidu.com/ai-doc/SPEECH/ek38lxj1u

    When delete_chars is given, each of those characters is replaced by
    a space and trailing spaces are stripped from the result.
    """
    try:
        # a non-zero err_no means the API reported an error
        if result_dict["err_no"] != 0:
            raise exceptions.SpeechToTextException(
                json.dumps(result_dict, indent=4, ensure_ascii=False))
        transcript = result_dict["result"][0]
        if not delete_chars:
            return transcript
        blank_table = str.maketrans(delete_chars, " " * len(delete_chars))
        return transcript.translate(blank_table).rstrip(" ")
    except (KeyError, TypeError):
        # malformed / unexpected payload shape
        return ""
def get_gcsv1p1beta1_transcript(
        min_confidence,
        result_dict):
    """
    Function for getting transcript from Google Cloud Speech-to-Text
    V1P1Beta1 result dictionary.

    Arguments:
        min_confidence: transcripts at or below this confidence return None.
        result_dict: MessageToDict-style response dictionary.

    Returns:
        The formatted transcript (first letter capitalized, right single
        quote normalized), None on low confidence or an empty response.

    Raises:
        exceptions.SpeechToTextException: when a non-empty response lacks
            the expected results structure.
    """
    if 'results' in result_dict and result_dict['results'] \
            and 'alternatives' in result_dict['results'][0] \
            and result_dict['results'][0]['alternatives'] \
            and 'transcript' in result_dict['results'][0]['alternatives'][0]:
        best_alternative = result_dict['results'][0]['alternatives'][0]
    else:
        if not result_dict:
            # if api returned empty json, don't throw the exception
            return None
        raise exceptions.SpeechToTextException(
            json.dumps(result_dict, indent=4, ensure_ascii=False))

    if 'confidence' in best_alternative and \
            float(best_alternative['confidence']) <= min_confidence:
        return None
    # can't find confidence in json means it's 100% confident;
    # capitalize first character and normalize the right single quote
    transcript = best_alternative['transcript']
    result = transcript[:1].upper() + transcript[1:]
    return result.replace('’', '\'')
async def xfyun_speech_websocket(url,
                                 app_id,
                                 filename,
                                 business_args,
                                 is_full_result=False):
    """
    Stream an audio file to the Xun Fei Yun speech-to-text websocket API
    frame by frame and collect the recognition results.

    Arguments:
        url: signed websocket URL for the Xfyun ASR service.
        app_id: Xfyun application id (sent with the first frame only).
        filename: path of the audio file to send (payload declares
            raw audio/L16 at 16000 Hz).
        business_args: Xfyun "business" parameter dictionary.
        is_full_result: when False accumulate/return the concatenated
            transcript string, when True the list of raw result dicts.

    NOTE(review): results are returned from the SpeechToTextException
    handler at the bottom; if the loop ends via ``break`` without any
    exception, control falls off the end of the coroutine and None is
    returned -- verify this is the intended behavior.
    """
    data = {
        "status": 0,
        "format": "audio/L16;rate=16000",
        "encoding": "raw",
        "audio": ""
    }
    common_args = {"app_id": app_id}
    business_args = business_args  # no-op self-assignment kept as-is
    transcript = ""
    result_list = []
    try:
        async with websockets.connect(url) as web_socket:
            frame_size = 1280  # audio size of every frame (bytes)
            interval = 0.04  # interval between two frame sends (seconds)
            # frame status: 0 = first frame, 1 = middle frame, 2 = last frame
            status = 0
            with open(filename, "rb") as fp:
                while True:
                    buf = fp.read(frame_size)
                    # end of file reached -- mark the last frame
                    if not buf:
                        status = 2
                    data["audio"] = str(base64.b64encode(buf), "utf-8")
                    # first frame handling:
                    # sent together with the business parameters;
                    # app_id is required and only sent with this frame
                    if status == 0:
                        data["status"] = 0
                        web_socket_data = {
                            "common": common_args,
                            "business": business_args,
                            "data": data
                        }
                        status = 1
                    # middle frame handling
                    elif status == 1:
                        data["status"] = 1
                        web_socket_data = {"data": data}
                    # last frame handling: send, read the final result,
                    # then leave the loop
                    elif status == 2:
                        data["status"] = 2
                        web_socket_data = {"data": data}
                        web_socket_json = json.dumps(web_socket_data)
                        await web_socket.send(web_socket_json)
                        try:
                            result = await web_socket.recv()
                            print(result)
                            web_socket_result = json.loads(result)
                            print(web_socket_result)
                        except (websockets.exceptions.ConnectionClosedOK,
                                ValueError):
                            # closed stream / bad JSON ends the session;
                            # the handler below returns what we have so far
                            raise exceptions.SpeechToTextException("")
                        if not is_full_result:
                            transcript = transcript + get_xfyun_transcript(
                                web_socket_result)
                        else:
                            result_list.append(web_socket_result)
                        # NOTE(review): time.sleep() blocks the event loop
                        # inside a coroutine; asyncio.sleep() would be the
                        # non-blocking equivalent -- confirm before changing
                        time.sleep(1)
                        break
                    # first / middle frames: send and read partial result
                    web_socket_json = json.dumps(web_socket_data)
                    await web_socket.send(web_socket_json)
                    try:
                        result = await web_socket.recv()
                        print(result)
                        web_socket_result = json.loads(result)
                    except websockets.exceptions.ConnectionClosedOK:
                        raise exceptions.SpeechToTextException("")
                    except ValueError:
                        # partial payload was not valid JSON -- skip it
                        continue
                    if not is_full_result:
                        transcript = transcript + get_xfyun_transcript(
                            web_socket_result)
                    else:
                        result_list.append(web_socket_result)
                    # simulate the audio sampling interval
                    time.sleep(interval)
    except websockets.exceptions.InvalidStatusCode:
        # handshake rejected (e.g. bad signature) -- re-raise as ours
        raise exceptions.SpeechToTextException(
            websockets.exceptions.InvalidStatusCode)
    except exceptions.SpeechToTextException:
        # normal termination path: return whatever was collected
        if not is_full_result:
            return transcript
        else:
            return result_list
def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statements, too-many-locals, too-many-arguments
        args,
        input_m=input,
        fps=30.0,
        styles_list=None,
        no_audio_prcs=False):
    """
    Give args and process an input audio or video file.

    Arguments:
        args: parsed command-line namespace driving the whole pipeline.
        input_m: input function used for overwrite confirmation
            (injectable for testing).
        fps: frames per second used when formatting subtitle timestamps.
        styles_list: ass/ssa style definitions, used when args.styles is set.
        no_audio_prcs: when True, skip the pre-API audio conversion and
            feed args.input to the API directly.

    Pipeline: obtain speech regions (external subtitle file or auditok
    detection) -> optionally write a "times" file -> split audio per
    region -> speech-to-text -> optionally translate -> write
    src/dst/bilingual subtitle files.
    """
    # choose plain-HTTP or HTTPS variant of the Google Speech V2 endpoint
    if args.http_speech_api:
        gsv2_api_url = "http://" + constants.GOOGLE_SPEECH_V2_API_URL
    else:
        gsv2_api_url = "https://" + constants.GOOGLE_SPEECH_V2_API_URL

    if not args.output_files:
        raise exceptions.AutosubException(
            _("\nNo works done."
              " Check your \"-of\"/\"--output-files\" option."))

    if args.ext_regions:
        # use external speech regions
        print(_("Use external speech regions."))
        # temp wav is created closed so the external command can write it
        audio_wav_temp = tempfile.NamedTemporaryFile(suffix='.wav',
                                                     delete=False)
        audio_wav = audio_wav_temp.name
        audio_wav_temp.close()
        command = args.audio_conversion_cmd.format(in_=args.input,
                                                   channel=1,
                                                   sample_rate=16000,
                                                   out_=audio_wav)
        print(command)
        subprocess.check_output(constants.cmd_conversion(command),
                                stdin=open(os.devnull))
        regions = sub_utils.sub_to_speech_regions(audio_wav=audio_wav,
                                                  sub_file=args.ext_regions)
        os.remove(audio_wav)
    else:
        # use auditok_gen_speech_regions
        mode = 0
        if args.strict_min_length:
            mode = auditok.StreamTokenizer.STRICT_MIN_LENGTH
            if args.drop_trailing_silence:
                mode = mode | auditok.StreamTokenizer.DROP_TRAILING_SILENCE
        elif args.drop_trailing_silence:
            mode = auditok.StreamTokenizer.DROP_TRAILING_SILENCE

        audio_wav_temp = tempfile.NamedTemporaryFile(suffix='.wav',
                                                     delete=False)
        audio_wav = audio_wav_temp.name
        audio_wav_temp.close()
        command = args.audio_conversion_cmd.format(in_=args.input,
                                                   channel=1,
                                                   sample_rate=48000,
                                                   out_=audio_wav)
        print(
            _("\nConvert source audio to \"{name}\" "
              "and get audio length for regions detection.").format(
                  name=audio_wav))
        print(command)
        subprocess.check_output(constants.cmd_conversion(command),
                                stdin=open(os.devnull))

        if not ffmpeg_utils.ffprobe_check_file(audio_wav):
            raise exceptions.AutosubException(
                _("Error: Convert source audio to \"{name}\" failed.").format(
                    name=audio_wav))

        regions = core.auditok_gen_speech_regions(
            audio_wav=audio_wav,
            energy_threshold=args.energy_threshold,
            min_region_size=constants.MIN_REGION_SIZE,
            max_region_size=constants.MAX_REGION_SIZE,
            max_continuous_silence=constants.DEFAULT_CONTINUOUS_SILENCE,
            mode=mode)
        os.remove(audio_wav)
        print(_("\n\"{name}\" has been deleted.").format(name=audio_wav))

    if not regions:
        raise exceptions.AutosubException(
            _("Error: Can't get speech regions."))

    if args.speech_language or \
            args.audio_process and 's' in args.audio_process:
        # process output first
        # the try/except KeyError pattern below means: only produce this
        # output if its keyword is in output_files (remove() raises
        # KeyError when absent)
        try:
            args.output_files.remove("regions")
            if args.styles and \
                    (args.format == 'ass' or
                     args.format == 'ssa' or
                     args.format == 'ass.json'):
                times_string = core.list_to_ass_str(
                    text_list=regions,
                    styles_list=styles_list,
                    subtitles_file_format=args.format)
            else:
                times_string = core.list_to_sub_str(
                    timed_text=regions,
                    fps=fps,
                    subtitles_file_format=args.format)
            # times to subtitles string
            times_name = "{base}.{nt}.{extension}".format(
                base=args.output,
                nt="times",
                extension=args.format)
            subtitles_file_path = core.str_to_file(str_=times_string,
                                                   output=times_name,
                                                   input_m=input_m)
            # subtitles string to file
            print(
                _("Times file created at \"{}\".").format(subtitles_file_path))

            if not args.output_files:
                raise exceptions.AutosubException(_("\nAll works done."))

        except KeyError:
            pass

        if not no_audio_prcs:
            # convert the input to the format/rate the API expects
            audio_for_api_temp = tempfile.NamedTemporaryFile(
                suffix=args.api_suffix,
                delete=False)
            audio_for_api = audio_for_api_temp.name
            audio_for_api_temp.close()
            command = args.audio_conversion_cmd.format(
                in_=args.input,
                channel=args.api_audio_channel,
                sample_rate=args.api_sample_rate,
                out_=audio_for_api)
            print(
                _("\nConvert to \"{name}\" "
                  "for API.").format(name=audio_for_api))
            print(command)
            subprocess.check_output(constants.cmd_conversion(command),
                                    stdin=open(os.devnull))
            if not ffmpeg_utils.ffprobe_check_file(audio_for_api):
                raise exceptions.AutosubException(
                    _("Error: Convert source audio to \"{name}\" failed.").
                    format(name=audio_for_api))
        else:
            audio_for_api = args.input

        # split the audio into one fragment per speech region
        audio_fragments = core.bulk_audio_conversion(
            source_file=audio_for_api,
            output=args.output,
            regions=regions,
            split_cmd=args.audio_split_cmd,
            suffix=args.api_suffix,
            concurrency=args.audio_concurrency,
            is_keep=args.keep)

        if not audio_fragments or \
                len(audio_fragments) != len(regions):
            # clean up partial fragments before aborting
            if not args.keep:
                for audio_fragment in audio_fragments:
                    os.remove(audio_fragment)
            raise exceptions.ConversionException(
                _("Error: Conversion failed."))

        if not args.keep:
            os.remove(audio_for_api)
            print(
                _("\n\"{name}\" has been deleted.").format(name=audio_for_api))

        if args.audio_process and 's' in args.audio_process:
            raise exceptions.AutosubException(
                _("Audio processing complete.\nAll works done."))

        # speech to text
        text_list = core.audio_to_text(audio_fragments=audio_fragments,
                                       api_url=gsv2_api_url,
                                       regions=regions,
                                       api_key=args.gspeechv2,
                                       concurrency=args.speech_concurrency,
                                       src_language=args.speech_language,
                                       min_confidence=args.min_confidence,
                                       audio_rate=args.api_sample_rate,
                                       is_keep=args.keep)

        if not text_list or len(text_list) != len(regions):
            raise exceptions.SpeechToTextException(
                _("Error: Speech-to-text failed.\nAll works done."))

        timed_text = get_timed_text(is_empty_dropped=args.drop_empty_regions,
                                    regions=regions,
                                    text_list=text_list)

        if args.dst_language:
            # process output first
            try:
                args.output_files.remove("src")
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    # first two styles are reserved for the source language
                    src_string = core.list_to_ass_str(
                        text_list=timed_text,
                        styles_list=styles_list[:2],
                        subtitles_file_format=args.format,
                    )
                else:
                    src_string = core.list_to_sub_str(
                        timed_text=timed_text,
                        fps=fps,
                        subtitles_file_format=args.format)
                # formatting timed_text to subtitles string
                src_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.speech_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=src_string,
                                                       output=src_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Speech language subtitles "
                      "file created at \"{}\".").format(subtitles_file_path))

                if not args.output_files:
                    raise exceptions.AutosubException(_("\nAll works done."))

            except KeyError:
                pass

            # text translation
            if args.gtransv2:
                # use gtransv2
                translated_text = core.list_to_gtv2(
                    text_list=text_list,
                    api_key=args.gtransv2,
                    concurrency=args.trans_concurrency,
                    src_language=args.src_language,
                    dst_language=args.dst_language,
                    lines_per_trans=args.lines_per_trans)
            else:
                # use googletrans
                translated_text = core.list_to_googletrans(
                    text_list,
                    src_language=args.src_language,
                    dst_language=args.dst_language,
                    sleep_seconds=args.sleep_seconds,
                    user_agent=args.user_agent,
                    service_urls=args.service_urls)

            if not translated_text or len(translated_text) != len(regions):
                raise exceptions.AutosubException(
                    _("Error: Translation failed."))

            timed_trans = get_timed_text(
                is_empty_dropped=args.drop_empty_regions,
                regions=regions,
                text_list=translated_text)

            try:
                args.output_files.remove("bilingual")
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    bilingual_string = core.list_to_ass_str(
                        text_list=[timed_text, timed_trans],
                        styles_list=styles_list,
                        subtitles_file_format=args.format,
                    )
                else:
                    bilingual_string = core.list_to_sub_str(
                        timed_text=timed_text + timed_trans,
                        fps=fps,
                        subtitles_file_format=args.format)
                # formatting timed_text to subtitles string
                bilingual_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.src_language + '&' + args.dst_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=bilingual_string,
                                                       output=bilingual_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Bilingual subtitles file "
                      "created at \"{}\".").format(subtitles_file_path))

                if not args.output_files:
                    raise exceptions.AutosubException(_("\nAll works done."))

            except KeyError:
                pass

            try:
                args.output_files.remove("dst")
                # formatting timed_text to subtitles string
                if args.styles and \
                        (args.format == 'ass' or
                         args.format == 'ssa' or
                         args.format == 'ass.json'):
                    if len(args.styles) == 4:
                        # styles 3-4 are reserved for the destination language
                        dst_string = core.list_to_ass_str(
                            text_list=timed_trans,
                            styles_list=styles_list[2:4],
                            subtitles_file_format=args.format,
                        )
                    else:
                        dst_string = core.list_to_ass_str(
                            text_list=timed_trans,
                            styles_list=styles_list,
                            subtitles_file_format=args.format,
                        )
                else:
                    dst_string = core.list_to_sub_str(
                        timed_text=timed_trans,
                        fps=fps,
                        subtitles_file_format=args.format)
                dst_name = "{base}.{nt}.{extension}".format(
                    base=args.output,
                    nt=args.dst_language,
                    extension=args.format)
                subtitles_file_path = core.str_to_file(str_=dst_string,
                                                       output=dst_name,
                                                       input_m=input_m)
                # subtitles string to file
                print(
                    _("Destination language subtitles "
                      "file created at \"{}\".").format(subtitles_file_path))
            except KeyError:
                pass
        else:
            # no destination language: only a source subtitles file makes sense
            if len(args.output_files) > 1 or not ({"dst", "src"}
                                                  & args.output_files):
                print(
                    _("Override \"-of\"/\"--output-files\" due to your args too few."
                      "\nOutput source subtitles file only."))
            # NOTE(review): timed_text was already computed above; this
            # recomputation looks redundant -- confirm
            timed_text = get_timed_text(
                is_empty_dropped=args.drop_empty_regions,
                regions=regions,
                text_list=text_list)
            if args.styles and \
                    (args.format == 'ass' or
                     args.format == 'ssa' or
                     args.format == 'ass.json'):
                src_string = core.list_to_ass_str(
                    text_list=timed_text,
                    styles_list=styles_list,
                    subtitles_file_format=args.format,
                )
            else:
                src_string = core.list_to_sub_str(
                    timed_text=timed_text,
                    fps=fps,
                    subtitles_file_format=args.format)
            # formatting timed_text to subtitles string
            src_name = "{base}.{nt}.{extension}".format(
                base=args.output,
                nt=args.speech_language,
                extension=args.format)
            subtitles_file_path = core.str_to_file(str_=src_string,
                                                   output=src_name,
                                                   input_m=input_m)
            # subtitles string to file
            print(
                _("Speech language subtitles "
                  "file created at \"{}\".").format(subtitles_file_path))
    else:
        # no speech-to-text requested: only the regions ("times") file
        # can be produced
        print(
            _("Override \"-of\"/\"--output-files\" due to your args too few."
              "\nOutput regions subtitles file only."))
        if args.styles and \
                (args.format == 'ass' or
                 args.format == 'ssa' or
                 args.format == 'ass.json'):
            times_subtitles = core.list_to_ass_str(
                text_list=regions,
                styles_list=styles_list,
                subtitles_file_format=args.format)
        else:
            times_subtitles = core.list_to_sub_str(
                timed_text=regions,
                fps=fps,
                subtitles_file_format=args.format)
        # times to subtitles string
        times_name = "{base}.{nt}.{extension}".format(base=args.output,
                                                      nt="times",
                                                      extension=args.format)
        subtitles_file_path = core.str_to_file(str_=times_subtitles,
                                               output=times_name,
                                               input_m=input_m)
        # subtitles string to file
        print(_("Times file created at \"{}\".").format(subtitles_file_path))
def on_error(self, web_socket, error):  # pylint: disable=no-self-use
    """
    Process the error from WebSocket.

    Arguments:
        web_socket: the WebSocket instance the error came from (unused here).
        error: the error object received from the WebSocket callback.

    Raises:
        exceptions.SpeechToTextException: always, wrapping the error so the
            caller handles it through the project's exception hierarchy.
    """
    raise exceptions.SpeechToTextException(error)