def speech_recognize_once_from_file_with_customized_model():
    """performs one-shot speech recognition with input from an audio file,
    specifying a custom model"""
    # <SpeechRecognitionUsingCustomizedModel>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Set the endpoint ID of your customized model
    # Replace with your own CRIS endpoint ID.
    speech_config.endpoint_id = "YourEndpointId"
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)

    # Creates a speech recognizer using a file as audio input.
    # The default language is "en-us".
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Starts speech recognition, and returns after a single utterance is recognized. The end of a
    # single utterance is determined by listening for silence at the end or until a maximum of 15
    # seconds of audio is processed. It returns the recognition text as result.
    # Note: Since recognize_once() returns only a single utterance, it is suitable only for single
    # shot recognition like command or query.
    # For long-running multi-utterance recognition, use start_continuous_recognition() instead.
    result = speech_recognizer.recognize_once()

    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(
            result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(
                cancellation_details.error_details))
def transcribe_streaming(stream_file, result_file):
    """Streams transcription of the given audio file."""
    import time

    # Microsoft authentication - add your API key on the line below. Using 'uksouth' as I am in the UK.
    speech_key, service_region = "your key", "uksouth"  # Add your key
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Return word timings!
    speech_config.request_word_level_timestamps()
    audio_input = speechsdk.AudioConfig(filename=stream_file)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        store('CLOSING on {}'.format(evt), result_file)
        nonlocal done
        done = True

    speech_recognizer.recognizing.connect(lambda evt: store('RECOGNIZING: {}'.format(evt.result.json), result_file))
    speech_recognizer.recognized.connect(lambda evt: store('JSON: {}'.format(evt.result.json), result_file))
    speech_recognizer.session_started.connect(lambda evt: store('SESSION STARTED: {}'.format(evt), result_file))
    speech_recognizer.session_stopped.connect(lambda evt: store('SESSION STOPPED {}'.format(evt), result_file))
    speech_recognizer.canceled.connect(lambda evt: store('CANCELED {}'.format(evt), result_file))

    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        # Poll until the session stops; results arrive via the callbacks above.
        time.sleep(.05)
    speech_recognizer.stop_continuous_recognition()
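# `transcribe_streaming` above calls a `store` helper that is not defined in
# this snippet. A minimal sketch under the assumption that it simply appends
# each event line to the result file (name and behavior are guesses):
def store(text, result_file):
    # Append one event line to the result file.
    with open(result_file, 'a', encoding='utf-8') as f:
        f.write(text + '\n')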
def speech_synthesis_viseme_event():
    """performs speech synthesis and shows the viseme event."""
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Creates a speech synthesizer with a null output stream.
    # This means the audio output data will not be written to any output channel.
    # You can just get the audio from the result.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=None)

    # Subscribes to viseme received event
    # The unit of evt.audio_offset is tick (1 tick = 100 nanoseconds), divide it by 10,000 to convert to milliseconds.
    speech_synthesizer.viseme_received.connect(lambda evt: print(
        "Viseme event received: audio offset: {}ms, viseme id: {}.".format(
            evt.audio_offset / 10000, evt.viseme_id)))

    # Receives a text from console input and synthesizes it to result.
    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()

        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}]".format(text))
            audio_data = result.audio_data
            print("{} bytes of audio data received.".format(len(audio_data)))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(
                cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(
                    cancellation_details.error_details))
def riconosci_audio():
    pygame.mixer.init()
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    speech_key, service_region = "YourSubscriptionKey", "francecentral"
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region,
        speech_recognition_language="it-IT")

    # Creates a recognizer with the given settings; plays a beep to signal that recording started.
    pygame.mixer.music.load("./sound/beep.mp3")
    pygame.mixer.music.play()
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)
    print("Sto registrando:")  # "Recording..."

    # Performs recognition. recognize_once() returns when the first utterance has been recognized,
    # so it is suitable only for single shot recognition like command or query. For long-running
    # recognition, use start_continuous_recognition() instead, or if you want to run recognition in a
    # non-blocking manner, use recognize_once_async().
    result = speech_recognizer.recognize_once()
    pygame.mixer.music.load("./sound/golf.mp3")
    pygame.mixer.music.play()

    # Checks result.
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(
            result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(
                cancellation_details.error_details))
def speech_recognize_continuous_from_file():
    """performs continuous speech recognition with input from an audio file"""
    # <SpeechContinuousRecognitionWithFile>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(
        lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(
        lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(
        lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
def record_audio(ask=False):
    # Replace with your own subscription key and service region.
    speech_key, service_region = "YourSubscriptionKey", "eastus"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)
    result = speech_recognizer.recognize_once()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(
            result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(
                cancellation_details.error_details))
    return result.text.lower()
def msft_tts(text):
    # Keep only the first 101 characters of the input.
    text = text[0:101]
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region.
    speech_key, service_region = 'YourSubscriptionKey', 'uksouth'
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # speech_config.speech_synthesis_voice_name = 'nb-NO-HuldaRUS'

    # Creates an audio configuration that points to an audio file.
    file_object = tempfile.NamedTemporaryFile(suffix='.wav')
    audio_output = speechsdk.AudioOutputConfig(filename=file_object.name)

    # Creates a synthesizer with the given settings
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output)

    # Synthesizes the text to speech.
    result = speech_synthesizer.speak_text_async(text).get()

    # Checks result.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized to [{}] for text [{}]".format(file_object.name, text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(cancellation_details.error_details))
            print("Did you update the subscription info?")

    # Upload the synthesized file to Google Cloud Storage and return its public URL.
    gcs_key = f'audio_files/{time.time()}.wav'
    upload_blob('arabia', file_object.name, gcs_key)
    return f'https://arabia.storage.googleapis.com/{gcs_key}'
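# `msft_tts` above calls an `upload_blob` helper that is not defined in this
# snippet. A minimal sketch, assuming the google-cloud-storage client library
# and that the bucket already exists:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Upload a local file to a Google Cloud Storage bucket.
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)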
def translate(text, language):
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    speech_key, service_region = "YourSubscriptionKey", "westeurope"
    language = language_dict[language]
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.set_property(
        speechsdk.PropertyId.SpeechServiceConnection_SynthLanguage, language)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz128KBitRateMonoMp3)

    # Creates a speech synthesizer using the default speaker as audio output.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config)

    # Synthesizes the received text to speech.
    # The synthesized speech is expected to be heard on the speaker when this line executes.
    result = speech_synthesizer.speak_text_async(text).get()

    # Checks result.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized to speaker for text [{}]".format(text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(
                    cancellation_details.error_details))
            print("Did you update the subscription info?")
    result = result.audio_data
    return result
    # print(translate("Hello world!", 'en'))
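# `translate` above looks up `language_dict`, which is not defined in this
# snippet. A plausible sketch, mapping short codes to BCP-47 synthesis
# languages; the exact entries are assumptions:
language_dict = {
    'en': 'en-US',
    'fr': 'fr-FR',
    'de': 'de-DE',
    'es': 'es-ES',
}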
def speech_synthesis_with_auto_language_detection_to_speaker():
    """performs speech synthesis to the default speaker with auto language detection
    Note: this is a preview feature, which might be updated in future versions."""
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # create the auto detection language configuration without specific languages
    auto_detect_source_language_config = \
        speechsdk.languageconfig.AutoDetectSourceLanguageConfig()

    # Creates a speech synthesizer using the default speaker as audio output.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        auto_detect_source_language_config=auto_detect_source_language_config)

    while True:
        # Receives a text from console input and synthesizes it to speaker.
        # For example, you can input "Bonjour le monde. Hello world.", then you will hear "Bonjour le monde."
        # spoken in a French voice and "Hello world." in an English voice.
        print("Enter some multilingual text that you want to speak, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()

        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized to speaker for text [{}]".format(text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(
                cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(
                    cancellation_details.error_details))
def process(self):
    logger.info('process:Enter')
    speech_key = model.key.AZURE_SPEECH_KEY
    service_region = model.key.AZURE_SERVICE_REGION
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region,
        speech_recognition_language="ja-JP")

    # setup the audio stream
    stream = speechsdk.audio.PushAudioInputStream(
        stream_format=speechsdk.audio.AudioStreamFormat(
            samples_per_second=16000, bits_per_sample=16))
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # instantiate the speech recognizer with push stream input
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config)

    # Connect callbacks to the events fired by the speech recognizer
    def write_transcribed_data(evt):
        logger.debug(f'write_transcribed(text={evt.result.text})')
        self.transcript = evt.result.text

    speech_recognizer.recognizing.connect(write_transcribed_data)
    speech_recognizer.recognized.connect(write_transcribed_data)
    speech_recognizer.session_started.connect(
        lambda evt: logger.debug('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: logger.debug('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(
        lambda evt: logger.debug('CANCELED {}'.format(evt)))

    # start continuous speech recognition
    logger.info('start transcode')
    speech_recognizer.start_continuous_recognition()
    self.stream_generator(stream)
    speech_recognizer.stop_continuous_recognition()
    stream.close()
    logger.info('end transcode')
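# `process` above calls `self.stream_generator(stream)`, which is not shown in
# this snippet. A minimal sketch, assuming `self.audio_source` yields 16 kHz,
# 16-bit mono PCM chunks (the attribute name is hypothetical):
def stream_generator(self, stream):
    # Push raw PCM chunks into the PushAudioInputStream until the source
    # is exhausted. process() closes the stream after recognition stops.
    for chunk in self.audio_source:
        stream.write(chunk)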
def processAudio():
    API_KEY = os.getenv('API_KEY')
    REGION = os.getenv('REGION')
    speech_config = speechsdk.SpeechConfig(subscription=API_KEY, region=REGION)
    speech_config.request_word_level_timestamps()
    audio_input = speechsdk.AudioConfig(filename="converted.wav")
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)

    done = False
    results = []

    def stopCallBack(evt):
        nonlocal done
        done = True

    def getResults(evt):
        nonlocal results
        results.append(evt.result)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(getResults)
    speech_recognizer.session_stopped.connect(stopCallBack)
    speech_recognizer.canceled.connect(stopCallBack)

    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()
    return results
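# Because request_word_level_timestamps() is set above, each result returned by
# processAudio() carries detailed JSON including per-word offsets. A sketch of
# extracting them; treat the exact JSON field names ('NBest', 'Words', 'Offset',
# 'Duration') as an assumption to verify against your SDK version:
import json

def word_timings(results):
    timings = []
    for result in results:
        detail = json.loads(result.json)
        best = detail.get('NBest', [{}])[0]
        for word in best.get('Words', []):
            # Offsets/durations are in 100-ns ticks; convert to seconds.
            timings.append((word['Word'],
                            word['Offset'] / 10_000_000,
                            word['Duration'] / 10_000_000))
    return timings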
def get_response():
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    speech_key, service_region = config.api_key, config.service_region
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Creates a recognizer with the given settings
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    print("Say something...")

    # Starts speech recognition, and returns after a single utterance is recognized. The end of a
    # single utterance is determined by listening for silence at the end or until a maximum of 15
    # seconds of audio is processed. The task returns the recognition text as result.
    # Note: Since recognize_once() returns only a single utterance, it is suitable only for single
    # shot recognition like command or query.
    # For long-running multi-utterance recognition, use start_continuous_recognition() instead.
    result = speech_recognizer.recognize_once()

    # Checks result.
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(
            result.no_match_details))
        return "nomatch"
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(
                cancellation_details.error_details))
    return result.text
def speech_recognize_continuous_from_file(filepath):
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=filepath)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    global transcription
    transcription = ''

    def addToTranscription(text):
        """appends recognized text, separating utterances with a space"""
        global transcription
        if transcription == '':
            transcription = text
        else:
            transcription = transcription + ' ' + text

    done = False

    def stop_cb(evt):
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    speech_recognizer.recognized.connect(
        lambda evt: addToTranscription(evt.result.text))
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    return transcription
def speech_synthesis_with_voice():
    """performs speech synthesis to the default speaker with specified voice"""
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Sets the synthesis voice name.
    # e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)".
    # The full list of supported voices can be found here:
    # https://aka.ms/csspeech/voicenames
    # You can also try the get_voices_async method to get all available voices
    # (see the speech_synthesis_get_available_voices() sample below).
    voice = "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)"
    speech_config.speech_synthesis_voice_name = voice

    # Creates a speech synthesizer for the specified voice,
    # using the default speaker as audio output.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config)

    # Receives a text from console input and synthesizes it to speaker.
    while True:
        print("Enter some text that you want to speak, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()

        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized to speaker for text [{}] with voice [{}]".format(text, voice))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(
                cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(
                    cancellation_details.error_details))
def speech_synthesis_to_mp3_file():
    """performs speech synthesis to an mp3 file"""
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Sets the synthesis output format.
    # The full list of supported formats can be found here:
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    # Creates a speech synthesizer using file as audio output.
    # Replace with your own audio file name.
    file_name = "outputaudio.mp3"
    file_config = speechsdk.audio.AudioOutputConfig(filename=file_name)
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=file_config)

    # Receives a text from console input and synthesizes it to mp3 file.
    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()

        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, file_name))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(
                cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(
                    cancellation_details.error_details))
def speech_recognize_continuous_from_file():
    """performs continuous speech recognition with input from an audio file"""
    speech_key, service_region = "api_key", "region"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(
        filename=globals()["audiofilename"])
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    # Collect final recognition results in a global list
    globals()["all_results"] = []

    def handle_final_result(evt):
        globals()["all_results"].append(evt.result.text)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(handle_final_result)
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
def recognize_intent_once_from_file():
    """performs one-shot intent recognition from input from an audio file"""
    # <IntentRecognitionOnceWithFile>
    # Set up the config for the intent recognizer (remember that this uses the
    # Language Understanding key, not the Speech Services key!)
    intent_config = speechsdk.SpeechConfig(subscription=intent_key, region=intent_service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=lampfilename)

    # Set up the intent recognizer
    intent_recognizer = speechsdk.intent.IntentRecognizer(speech_config=intent_config, audio_config=audio_config)

    # Set up the intents that are to be recognized. These can be a mix of simple phrases and
    # intents specified through a LanguageUnderstanding Model.
    model = speechsdk.intent.LanguageUnderstandingModel(app_id=language_understanding_app_id)
    intents = [
        (model, "HomeAutomation.TurnOn"),
        (model, "HomeAutomation.TurnOff"),
        ("This is a test.", "test"),
        ("Switch to channel 34.", "34"),
        ("what's the weather like", "weather"),
    ]
    intent_recognizer.add_intents(intents)

    # Run the intent recognizer.
    intent_result = intent_recognizer.recognize_once()

    # Check the results
    if intent_result.reason == speechsdk.ResultReason.RecognizedIntent:
        print("Recognized: \"{}\" with intent id `{}`".format(intent_result.text, intent_result.intent_id))
    elif intent_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(intent_result.text))
    elif intent_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(intent_result.no_match_details))
    elif intent_result.reason == speechsdk.ResultReason.Canceled:
        print("Intent recognition canceled: {}".format(intent_result.cancellation_details.reason))
        if intent_result.cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(intent_result.cancellation_details.error_details))
def synthesize_translations(result):
    language_to_voice_map = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-AriaNeural",
        "it": "it-IT-ElsaNeural",
        "pt": "pt-BR-FranciscaNeural",
        "zh-Hans": "zh-CN-XiaoxiaoNeural"
    }
    print(f'Recognized: "{result.text}"')

    for language in result.translations:
        translation = result.translations[language]
        print(f'Translated into "{language}": {translation}')

        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        speech_config.speech_synthesis_voice_name = language_to_voice_map.get(language)
        audio_config = speechsdk.audio.AudioOutputConfig(
            filename=f'{language}-translation.wav')
        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_config)
        speech_synthesizer.speak_text_async(translation).get()
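# `synthesize_translations` above expects a translation result such as one
# produced by a TranslationRecognizer. A minimal sketch of producing it; the
# input file name and target-language list are assumptions:
translation_config = speechsdk.translation.SpeechTranslationConfig(
    subscription=speech_key, region=service_region)
translation_config.speech_recognition_language = "en-US"
for lang in ("de", "it", "pt", "zh-Hans"):
    translation_config.add_target_language(lang)

audio_config = speechsdk.audio.AudioConfig(filename="input.wav")  # assumed input file
recognizer = speechsdk.translation.TranslationRecognizer(
    translation_config=translation_config, audio_config=audio_config)

result = recognizer.recognize_once()
if result.reason == speechsdk.ResultReason.TranslatedSpeech:
    synthesize_translations(result)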
def llamada():
    # Replace with your own subscription key and service region.
    speech_key, service_region = "YourSubscriptionKey", "eastus"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language="es-MX")
    print("Se ha iniciado la grabación de la llamada...")  # "Call recording has started..."
    result = speech_recognizer.recognize_once()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(
            result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(
                cancellation_details.error_details))
    return result.text
def speech2text():
    # Replace with your own subscription key and service region.
    speech_key, service_region = "YourSubscriptionKey", "westus"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Creates a recognizer with the given settings
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)
    result = speech_recognizer.recognize_once()

    # Checks result.
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(result.no_match_details))
        return 'error please try again'
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
        return 'error please try again'
def speech_recognize_continuous_from_file(filename, lang):
    """performs continuous speech recognition with input from an audio file"""
    speech_config = speechsdk.SpeechConfig(
        subscription=app.config['speech_key'],
        region=app.config['service_region'])
    speech_config.speech_recognition_language = lang
    # speech_config.request_word_level_timestamps()
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    all_res = []

    def stop_cb(evt):
        """callback that stops continuous recognition upon receiving an event `evt`"""
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True

    def handle_final_result(evt):
        """callback that handles continuous recognition results upon receiving an event `evt`"""
        all_res.append(evt.result.text)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    return all_res
def speech_synthesis_to_speaker():
    """performs speech synthesis to the default speaker"""
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Creates a speech synthesizer using the default speaker as audio output.
    # The default spoken language is "en-us".
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    # Receives a text from console input and synthesizes it to speaker.
    while True:
        print("Enter some text that you want to speak, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()

        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized to speaker for text [{}]".format(text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
def speech_synthesis_with_voice(language, gender, text_to_speech):
    if language.upper() == "ES":
        language = "spanish"
    elif language.upper() == "EN":
        language = "english"
    gender = gender.lower()
    # Note: no female English voice is configured; gender_list[language][gender]
    # raises KeyError for ("english", "female").
    gender_list = {
        "spanish": {
            "male": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)",
            "female": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)"
        },
        "english": {
            "male": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)"
        }
    }
    random_file_name = str(random.choice(range(1, 5000))) + ".mp3"
    file_config = speechsdk.audio.AudioOutputConfig(filename=random_file_name)
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    voice = gender_list[language][gender]
    speech_config.speech_synthesis_voice_name = voice
    # Use an MP3 output format to match the ".mp3" file extension
    # (the original set Riff24Khz16BitMonoPcm, a WAV format, which mismatched the extension).
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz128KBitRateMonoMp3)
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=file_config)
    speech_synthesizer.speak_text_async(text_to_speech).get()
    return random_file_name
def setup(self):
    """gives an example how to use a push audio stream to recognize speech from a custom audio source"""
    CHUNKSIZE = 1024
    SAMPLE_WIDTH = 2
    RATE = 16000
    CHANNELS = 1

    speech_config = speechsdk.SpeechConfig(
        subscription=self.speech_key, region=self.service_region,
        speech_recognition_language=self.language)

    # setup the audio stream
    self.asr_stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=self.asr_stream)

    # instantiate the speech recognizer with push stream input
    self.speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config)

    # Connect callbacks to the events fired by the speech recognizer
    self.speech_recognizer.recognizing.connect(
        lambda evt: self.result_callback('RECOGNIZING', evt))
    self.speech_recognizer.recognized.connect(
        lambda evt: self.result_callback('RECOGNIZED', evt))
    self.speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    self.speech_recognizer.session_stopped.connect(
        lambda evt: self.stop_callback('SESSION STOPPED', evt))
    self.speech_recognizer.canceled.connect(
        lambda evt: self.stop_callback('CANCELED', evt))

    self.speech_recognizer.start_continuous_recognition()

    t = threading.Thread(target=self._generator)
    t.start()
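# `setup` above starts a thread running `self._generator`, which is not shown.
# A minimal sketch, assuming `self.source` exposes a blocking read() of raw PCM
# bytes and `self.running` is a flag set elsewhere (both names hypothetical):
def _generator(self):
    while self.running:
        # Read CHUNKSIZE frames of SAMPLE_WIDTH bytes each.
        chunk = self.source.read(1024 * 2)
        if not chunk:
            break
        self.asr_stream.write(chunk)
    # Closing the stream tells the recognizer no more audio is coming.
    self.asr_stream.close()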
def SpeechRecog():
    speech_key, service_region = "speech_key", "service_region"
    weatherfilename = "recorded.wav"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)

    all_results = []

    def handle_final_result(a):
        all_results.append(a)

    # Creates a recognizer with the given settings
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   audio_config=audio_config)
    speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: print('\nSESSION STOPPED {}'.format(evt)))
    speech_recognizer.recognized.connect(
        lambda evt: handle_final_result(evt.result.text))

    # print('Say a few words\n\n')
    speech_recognizer.start_continuous_recognition()
    # Process up to 30 seconds of audio, then stop.
    time.sleep(30)
    speech_recognizer.stop_continuous_recognition()
    speech_recognizer.session_started.disconnect_all()
    speech_recognizer.recognized.disconnect_all()
    speech_recognizer.session_stopped.disconnect_all()

    print(all_results)
    with open('your_file.txt', 'w') as f:
        for item in all_results:
            f.write("%s\n" % item)
def main():
    try:
        global speech_config

        # Get Configuration Settings
        load_dotenv()
        cog_key = os.getenv('COG_SERVICE_KEY')
        cog_region = os.getenv('COG_SERVICE_REGION')

        # Configure speech service
        speech_config = speech_sdk.SpeechConfig(cog_key, cog_region)
        print('Ready to use Pedalboard DAW:', speech_config.region)

        # Get raw audio file and sample rate
        audio_file, sample_rate = sf.read('Bass.wav')

        # Make a Pedalboard object, containing multiple plugins:
        board = Pedalboard(
            [
                # Compressor(threshold_db=-25, ratio=10),
                # Limiter(),
            ],
            sample_rate=sample_rate)

        # Get user input
        command = ''
        while command != 'quit session.':
            command = transcribe_command().lower()
            if command != 'quit session.':
                execute_command(command, board, audio_file, sample_rate)
            else:
                command = 'quit session.'
    except Exception as ex:
        print(ex)
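# main() above relies on transcribe_command() and execute_command(), which are
# not shown. A minimal sketch of transcribe_command under the assumptions of
# this snippet (module-level speech_config, `speech_sdk` import alias):
def transcribe_command():
    # Capture one utterance from the default microphone and return its text.
    recognizer = speech_sdk.SpeechRecognizer(speech_config=speech_config)
    result = recognizer.recognize_once()
    if result.reason == speech_sdk.ResultReason.RecognizedSpeech:
        return result.text
    return ''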
def __init__(self, key=CONFIGS["stt_key"], region=CONFIGS["service_region"]):
    self.speech_config = speechsdk.SpeechConfig(subscription=key, region=region)

    # setup the audio stream
    self.stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=self.stream)
    self._reset()

    # instantiate the speech recognizer with push stream input
    self.speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config,
                                                        audio_config=audio_config)

    # Connect callbacks to the events fired by the speech recognizer
    # speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    def recognized_handler(evt):
        print('RECOGNIZED: {}'.format(evt.result.text))
        self.recognized = True
        self.recognized_text = evt.result.text

    self.speech_recognizer.recognized.connect(recognized_handler)
    self.speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    self.speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    self.speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

    self.speech_recognizer.start_continuous_recognition()
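# Hypothetical usage of the class whose __init__ is shown above. The class
# name, file name, and audio format (16 kHz, 16-bit mono PCM) are assumptions;
# `recognized` / `recognized_text` are the flags set by recognized_handler:
import time

stt = AzureSTT()  # assumed class name
with open('utterance.raw', 'rb') as f:
    stt.stream.write(f.read())
stt.stream.close()  # signal end of audio
while not stt.recognized:
    time.sleep(0.1)
print(stt.recognized_text)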
def speech_recognize_once_from_mic():
    """performs one-shot speech recognition from the default microphone"""
    # <SpeechRecognitionWithMicrophone>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Creates a speech recognizer using microphone as audio input.
    # The default language is "en-us".
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    # Perform recognition. `recognize_once` blocks until an utterance has been recognized, after
    # which recognition stops and a result is returned. Thus, it is suitable only for single-shot
    # recognition like command or query. For long-running recognition, use continuous recognition
    # instead.
    result = speech_recognizer.recognize_once()

    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
import os
import sys

sys.path.append('transcript/ghostvlad')
import model as spkModel
import toolkits
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
import numpy as np
import uisrnn
import librosa

# sys.path.append('visualization')
# from viewer import PlotDiar

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region.
speech_key, service_region = "YourSubscriptionKey", "centralindia"
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key, region=service_region)


def pipeline(audio):
    timestamps = dia_audio(audio)
    output = asr(audio, timestamps)
    return output

"""A demo script showing how to do diarization on WAV files using UIS-RNN."""

# ===========================================
# Parse the arguments
# ===========================================