def voice(message: str):
    """Generate a neural-TTS audio file for *message*, save it, and play it.

    Builds an SSML file via ``build_XAL``, synthesizes it with the Azure
    Speech SDK, writes the WAV under ``./talks/voices/``, then plays it
    with ``playsound``.

    Args:
        message: Text to synthesize.
    """
    # Basic Azure Speech configuration.
    speech_key, service_region = Config.SPEECH_TOKEN, Config.SPEECH_REGION
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region)
    # BUGFIX: SPEECH_VOICE_NAME is a synthesis voice name, not a recognition
    # language — it was previously passed as `speech_recognition_language`.
    speech_config.speech_synthesis_voice_name = Config.SPEECH_VOICE_NAME
    # audio_config=None keeps the audio on the result instead of playing it
    # through the default speaker.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=None)

    # Refresh the timestamp used to name the output files.
    _now_timestamp = int(time.time())
    # BUGFIX: `_now_array` was read without ever being assigned (NameError);
    # derive it from the timestamp computed just above.
    _now_array = time.localtime(_now_timestamp)
    _now_time_string = time.strftime("%H%M%S", _now_array)

    # Build the SSML (XML) file, synthesize it, then save and play the WAV.
    _file_name = f"{_now_time_string}-{randomString()}"
    build_XAL(message, _file_name)
    # NOTE(review): `_now_day_string` is assumed to be a module-level day
    # string (directory name) maintained elsewhere — confirm.
    # Use a context manager so the file handle is closed (was leaked before).
    with open(f"./talks/xmls/{_now_day_string}/{_file_name}.xml",
              "r", encoding="utf-8") as ssml_file:
        ssml_string = ssml_file.read()
    result = speech_synthesizer.speak_ssml_async(ssml_string).get()
    stream = speechsdk.AudioDataStream(result)
    stream.save_to_wav_file(
        f"./talks/voices/{_now_day_string}/{_file_name}.wav")
    playsound(f"./talks/voices/{_now_day_string}/{_file_name}.wav")
def speech_recognize_keyword_locally_from_microphone():
    """Run keyword spotting locally, with direct access to the result audio."""
    # Keyword recognition model; update this to point to the location of
    # your own keyword recognition model (.table file).
    model = speechsdk.KeywordRecognitionModel(
        "YourKeywordRecognitionModelFile.table")

    # The phrase your keyword recognition model triggers on.
    keyword = "YourKeyword"

    # Local keyword recognizer using the default microphone device for input.
    keyword_recognizer = speechsdk.KeywordRecognizer()

    done = False

    def recognized_cb(evt):
        # Only a keyword phrase is recognized. The result cannot be 'NoMatch'
        # and there is no timeout. The recognizer runs until a keyword phrase
        # is detected or recognition is canceled (by stop_recognition_async()
        # or due to the end of an input file or stream).
        result = evt.result
        if result.reason == speechsdk.ResultReason.RecognizedKeyword:
            print("RECOGNIZED KEYWORD: {}".format(result.text))
        nonlocal done
        done = True

    def canceled_cb(evt):
        result = evt.result
        if result.reason == speechsdk.ResultReason.Canceled:
            print('CANCELED: {}'.format(result.cancellation_details.reason))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the keyword recognizer.
    keyword_recognizer.recognized.connect(recognized_cb)
    keyword_recognizer.canceled.connect(canceled_cb)

    # Start keyword recognition and block until a result is available.
    result_future = keyword_recognizer.recognize_once_async(model)
    print('Say something starting with "{}" followed by whatever you want...'.
          format(keyword))
    result = result_future.get()

    # Read result audio (incl. the keyword).
    if result.reason == speechsdk.ResultReason.RecognizedKeyword:
        time.sleep(2)  # give some time so the stream is filled
        # Stop any more data from input getting to the stream.
        result_stream = speechsdk.AudioDataStream(result)
        result_stream.detach_input()
        save_future = result_stream.save_to_wav_file_async(
            "AudioFromRecognizedKeyword.wav")
        print('Saving file...')
        # FIX: the future's return value was bound to an unused local
        # (`saved`); just wait for the async save to complete.
        save_future.get()
def speech_synthesis_to_audio_data_stream():
    """performs speech synthesis and gets the audio data from single request based stream."""
    # Speech config built from the module-level subscription key and region.
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    # A null output stream: the synthesized audio is not written to any
    # output channel, so it can be pulled straight off the result object.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config,
                                              audio_config=None)

    # Read text from the console and synthesize it until EOF (Ctrl-Z).
    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break

        result = synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}]".format(text))
            data_stream = speechsdk.AudioDataStream(result)

            # All the data in the audio stream can be saved to a file...
            file_name = "outputaudio.wav"
            data_stream.save_to_wav_file(file_name)
            print("Audio data for text [{}] was saved to [{}]".format(
                text, file_name))

            # ...or read back and processed in memory. Saving to file moved
            # the stream position to the end, so rewind first.
            data_stream.position = 0
            chunk = bytes(16000)
            total_size = 0
            while True:
                filled_size = data_stream.read_data(chunk)
                if filled_size <= 0:
                    break
                print("{} bytes received.".format(filled_size))
                total_size += filled_size
            print("Totally {} bytes received for text [{}].".format(
                total_size, text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(details.reason))
            if details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(details.error_details))
def call_tts(text, tgt_lang):
    """Synthesize `text` for the target language and save it under static/.

    Returns the generated WAV file name (timestamp-based).
    """
    # Null audio_config keeps the synthesized audio on the result object.
    tts = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                      audio_config=None)

    # Build the SSML document for the requested language and voice.
    # NOTE(review): the SSML namespace is normally "http://..." — confirm
    # the service accepts this https variant.
    root = ElementTree.Element('speak', version='1.0')
    root.set("xmlns", 'https://www.w3.org/2001/10/synthesis')
    root.set("xml:lang", tts_lang[tgt_lang])
    voice_node = ElementTree.SubElement(root, 'voice')
    voice_node.set("name", tts_voice[tgt_lang])
    voice_node.text = str(text)
    ssml = ElementTree.tostring(root).decode("utf-8")

    # Synchronous synthesis, then dump the audio stream to a WAV file.
    outcome = tts.speak_ssml(ssml)
    audio_filename = str(time.time()) + ".wav"
    stream = speechsdk.AudioDataStream(outcome)
    stream.save_to_wav_file("static/" + audio_filename)
    print("Audio file saved to: " + audio_filename, "lang: " + tgt_lang)
    return audio_filename
def xmlaudio():
    """Synthesize a fixed Hindi sample phrase via SSML and save it to voice.wav."""
    tts = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                      audio_config=None)

    # SSML document: Hindi locale with the Kalpana (Apollo) voice.
    speak = ElementTree.Element('speak', version='1.0')
    speak.set("xmlns", 'https://www.w3.org/2001/10/synthesis')
    speak.set("xml:lang", 'hi-IN')
    voice_node = ElementTree.SubElement(speak, 'voice')
    voice_node.set("name", 'hi-IN-Kalpana-Apollo')
    voice_node.text = str("हिंदी")
    ssml = ElementTree.tostring(speak).decode("utf-8")

    # Asynchronous synthesis; block on the future, then save the audio.
    outcome = tts.speak_ssml_async(ssml).get()
    speechsdk.AudioDataStream(outcome).save_to_wav_file("voice.wav")
def saveAudioFile(self, result, filename):
    """Write the audio carried by a synthesis `result` to `filename` as WAV."""
    speechsdk.AudioDataStream(result).save_to_wav_file(filename)
def image_caption(request):
    """Tag an uploaded image, synthesize TTS audio for each tag, and return
    the tags with Korean dictionary translations.

    Flow: Azure Computer Vision tags the image (up to 8 tags kept); each tag
    name is synthesized to a WAV (Azure Speech for voices 0-3, a custom
    docker TTS service for voice 4); tag names are then looked up in the
    Microsoft Translator dictionary (en -> ko) and untranslatable or
    identity translations are dropped from the response.

    NOTE(review): `request` is assumed to be a DRF request with multipart
    data keys 'num' (voice index as str/int) and 'img' (file-like) — confirm
    against the caller/serializer.
    """
    voice_num = int(request.data.get('num'))
    # Per-voice SSML prosody settings for the four Azure voices (index 0-3).
    # Voice 4 is handled by the external docker TTS service instead.
    voice_index = [
        {
            'name': 'en-US-AriaRUS',
            'pitch': '-10%',
            'rate': '-10%'
        },
        {
            'name': 'en-US-ZiraRUS',
            'pitch': '20%',
            'rate': '-10%'
        },
        {
            'name': 'en-US-GuyRUS',
            'pitch': '10%',
            'rate': '-20%'
        },
        {
            'name': 'en-US-BenjaminRUS',
            'pitch': '20%',
            'rate': '-20%'
        },
    ]
    mediaURL = getattr(settings, 'MEDIA_URL', 'MEDIA_URL')
    mediaROOTURL = getattr(settings, 'MEDIA_ROOT', 'MEDIA_ROOT')
    # Azure Speech: key from Django settings, region hard-coded to Korea.
    speech_key, service_region = getattr(settings, 'MS_API_KEY', 'MS_API_KEY'), "koreacentral"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat["Riff16Khz16BitMonoPcm"])
    # audio_config=None: audio stays on the result object (no speaker output).
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    if voice_num != 4:
        # Build the SSML skeleton once; `prosody.text` is mutated per tag
        # inside the loop below and the tree re-serialized each time.
        speak = ET.Element('speak')
        speak.set('version', '1.0')
        speak.set('xmlns', 'https://www.w3.org/2001/10/synthesis')
        speak.set('xml:lang', 'en-US')
        voice = ET.SubElement(speak, 'voice')
        voice.set('name', voice_index[voice_num]['name'])
        prosody = ET.SubElement(voice, 'prosody')
        prosody.set('rate', voice_index[voice_num]['rate'])
        prosody.set('pitch', voice_index[voice_num]['pitch'])
    try:
        img = request.data.get('img')
    except:
        # "Bad image input" (Korean message kept verbatim).
        return Response({'error': '이미지 잘못 들어왔어요'}, status=status.HTTP_400_BAD_REQUEST)
    MSVS_API_KEY = getattr(settings, 'MSVS_API_KEY', 'MSVS_API_KEY')
    endpoint = "https://jes5918.cognitiveservices.azure.com/"
    computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(MSVS_API_KEY))
    try:
        tags_result_remote = computervision_client.tag_image_in_stream(img)
    except:
        # "Check the image file format."
        return Response({'error': '이미지 파일 형식을 확인하세요.'}, status=status.HTTP_400_BAD_REQUEST)
    if (len(tags_result_remote.tags) == 0):
        # "No tags were generated."
        return Response({'error' : '생성된 태그가 없습니다.'}, status=status.HTTP_400_NOT_FOUND)
    else:
        captiontags = []  # response payload entries, one per kept tag
        body = []         # translator request body: [{'text': tag}, ...]
        for idx, tag in enumerate(tags_result_remote.tags):
            # Keep at most 8 tags.
            if idx == 8:
                break
            if voice_num == 4:
                # Voice 4: fetch audio from the external docker TTS service.
                dockerUrl = "http://j4b105.p.ssafy.io:5002/api/tts?text=" + tag.name
                responseData = requests.request("GET", dockerUrl)
                data, samplerate = sf.read(io.BytesIO(responseData.content))
                stream_path = mediaROOTURL+ '/tts_basic/' + str(voice_num) + tag.name + '.wav'
                sf.write(stream_path, data, samplerate)
                # Pre-fill the other voice slots with this audio if missing,
                # so every "<i><tag>.wav" path exists.
                for i in range(5):
                    if i == voice_num:
                        continue
                    stream_path2 = mediaROOTURL+ '/tts_basic/' + str(i) + tag.name + '.wav'
                    if not os.path.isfile(stream_path2):
                        sf.write(stream_path2, data, samplerate)
            else:
                # Voices 0-3: reuse the SSML tree, swapping in this tag name.
                prosody.text = tag.name
                mydata = ET.tostring(speak).decode("utf-8")
                result = synthesizer.speak_ssml_async(mydata).get()
                stream = speechsdk.AudioDataStream(result)
                stream_path = mediaROOTURL+ '/tts_basic/' + str(voice_num) + tag.name + '.wav'
                # Checks result..
                if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                    stream.save_to_wav_file(stream_path)
                    # Fill missing alternate-voice files with this audio too.
                    for i in range(5):
                        if i == voice_num:
                            continue
                        stream_path2 = mediaROOTURL+ '/tts_basic/' + str(i) + tag.name + '.wav'
                        if not os.path.isfile(stream_path2):
                            stream.save_to_wav_file(stream_path2)
                elif result.reason == speechsdk.ResultReason.Canceled:
                    cancellation_details = result.cancellation_details
                    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
                    if cancellation_details.reason == speechsdk.CancellationReason.Error:
                        if cancellation_details.error_details:
                            print("Error details: {}".format(cancellation_details.error_details))
                            print("Did you update the subscription info?")
                    # Abort the whole request on any synthesis cancellation.
                    return Response({'error' : 'voice tts error please retry'}, status=status.HTTP_503_SERVICE_UNAVAILABLE)
            captiontags.append({
                'content': tag.name,
                'filepath': mediaURL+'tts_basic/' + str(voice_num) + tag.name + '.wav',
                'checked': False
            })
            body.append({'text': tag.name})
        # Translate every tag name en -> ko via the Translator dictionary API.
        endpoint = "https://api.cognitive.microsofttranslator.com/dictionary/lookup"
        params = {
            'api-version': '3.0',
            'from': 'en',
            'to': 'ko'
        }
        headers = {
            'Ocp-Apim-Subscription-Key': getattr(settings, 'MSTR_API_KEY', 'MSTR_API_KEY'),
            'Ocp-Apim-Subscription-Region': "koreacentral",
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4())
        }
        try:
            # NOTE(review): this shadows the DRF `request` parameter — the
            # original request object is unreachable from here on.
            request = requests.post(endpoint, params=params, headers=headers, json=body)
        except:
            # "Error occurred during translation."
            return Response({'error' : '번역에 에러가 발생'}, status=status.HTTP_400_BAD_REQUEST)
        response = request.json()
        # Translator POS tag -> Korean part-of-speech label.
        posIndex = {
            'ADJ': '형용사',
            'ADV': '부사',
            'CONJ': '접속사',
            'DET': '한정사',
            'MODAL': '동사',
            'NOUN': '명사',
            'PREP': '전치사',
            'PRON': '대명사',
            'VERB': '동사',
            'OTHER': '기타',
        }
        remove_idx = []
        for i in range(len(body)):
            try:
                captiontags[i]["mean"] = response[i]["translations"][0]["displayTarget"]
                captiontags[i]["part"] = posIndex[response[i]["translations"][0]["posTag"]]
                # Drop tags whose "translation" is identical to the English.
                if captiontags[i]["content"] == captiontags[i]["mean"]:
                    remove_idx.append(i)
            except:
                # No translation available: mark for removal but still tag
                # the entry as untranslated.
                remove_idx.append(i)
                captiontags[i]["mean"] = "nottrans"
                captiontags[i]["part"] = "nottrans"
        # Keep only the entries that were not flagged for removal.
        captiontags_res = []
        for idx, c in enumerate(captiontags):
            if idx in remove_idx:
                continue
            captiontags_res.append(c)
        return Response({'data' : captiontags_res}, status=status.HTTP_200_OK)
def text_to_speech(request):
    """Synthesize the posted English text to a WAV and return its media path.

    Rejects input containing Korean characters, builds an SSML request for
    the selected voice (index 'num' into `voice_index`), synthesizes it via
    Azure Speech, and saves the audio under MEDIA_ROOT/tts_basic/.

    NOTE(review): `request` is assumed to be a DRF request with data keys
    'text' (str) and 'num' (voice index 0-4) — confirm against the caller.
    """
    text = request.data.get('text')
    # Reject Korean input: checks each character against the U+3131..U+D7A3
    # range ('ㄱ'..'힣'). NOTE(review): this range also spans non-Hangul
    # codepoints (CJK ideographs, symbols) — confirm intent.
    for content in text:
        if ord('ㄱ') <= ord(content) <= ord('힣'):
            return Response({'error': '한글은 작성할 수 없습니다'},
                            status=status.HTTP_400_BAD_REQUEST)
    voice_num = int(request.data.get('num'))
    # Per-voice SSML prosody settings (5 voice presets; index 4 reuses Aria
    # with different prosody).
    voice_index = [{
        'name': 'en-US-AriaRUS',
        'pitch': '-10%',
        'rate': '-10%'
    }, {
        'name': 'en-US-ZiraRUS',
        'pitch': '20%',
        'rate': '-10%'
    }, {
        'name': 'en-US-GuyRUS',
        'pitch': '10%',
        'rate': '-20%'
    }, {
        'name': 'en-US-BenjaminRUS',
        'pitch': '20%',
        'rate': '-20%'
    }, {
        'name': 'en-US-AriaRUS',
        'pitch': '30%',
        'rate': '-30%'
    }]
    mediaURL = getattr(settings, 'MEDIA_URL', 'MEDIA_URL')
    mediaROOTURL = getattr(settings, 'MEDIA_ROOT', 'MEDIA_ROOT')
    # Azure Speech: key from Django settings, region hard-coded to Korea.
    speech_key, service_region = getattr(settings, 'MS_API_KEY', 'MS_API_KEY'), "koreacentral"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat["Riff16Khz16BitMonoPcm"])
    # audio_config=None: audio stays on the result object (no speaker output).
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                              audio_config=None)
    # Build the SSML document: speak > voice > prosody > text.
    speak = ET.Element('speak')
    speak.set('version', '1.0')
    speak.set('xmlns', 'https://www.w3.org/2001/10/synthesis')
    speak.set('xml:lang', 'en-US')
    voice = ET.SubElement(speak, 'voice')
    voice.set('name', voice_index[voice_num]['name'])
    prosody = ET.SubElement(voice, 'prosody')
    prosody.set('rate', voice_index[voice_num]['rate'])
    prosody.set('pitch', voice_index[voice_num]['pitch'])
    prosody.text = text
    mydata = ET.tostring(speak).decode("utf-8")
    result = synthesizer.speak_ssml_async(mydata).get()
    stream = speechsdk.AudioDataStream(result)
    # Derive the output file name from the text: strip spaces and periods,
    # lowercase.
    temp = ('').join(text.split(' ')).lower()
    temp2 = ('').join(temp.split('.'))
    stream_path = mediaROOTURL + '/tts_basic/' + temp2 + '.wav'
    # Checks result.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        stream.save_to_wav_file(stream_path)
        print("complete")
        return Response({'filepath': mediaURL + 'tts_basic/' + temp2 + '.wav'},
                        status=status.HTTP_200_OK)
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(
                    cancellation_details.error_details))
                print("Did you update the subscription info?")
        # "Server error." — synthesis was canceled.
        return Response({'error': '서버 에러입니다.'},
                        status=status.HTTP_503_SERVICE_UNAVAILABLE)
    # Fallback for any other (unexpected) result reason.
    return Response({'error': '서버 에러입니다.'},
                    status=status.HTTP_503_SERVICE_UNAVAILABLE)