def listen_for_text(context="none", device_index=2):
    """Record microphone audio and stream it to Watson Speech-to-Text.

    Blocks until another part of the program sets the global ``stop_now``
    flag, then tears the audio stream down and returns the global ``text``
    accumulated by the recognition callback.

    Args:
        context: Opaque context label (currently unused by this function).
        device_index: PyAudio input device index. Generalized from the
            previously hard-coded value ``2``; default preserves old behavior.

    Returns:
        The global ``text`` produced by the recognition thread.
    """
    global stop_now, text, q, service, audio_source
    import time  # local import: the module import block is outside this view

    stop_now = False

    # Pre-V4 SDK credential style: apikey + url keyword arguments.
    iam_apikey = os.environ['IAM_APIKEY']
    service = SpeechToTextV1(
        url='https://gateway-lon.watsonplatform.net/speech-to-text/api',
        iam_apikey=iam_apikey)

    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10
    # Queue capacity is measured in chunks; overflow chunks are discarded
    # by the pyaudio callback when the websocket can't keep up.
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, True, True)

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        stream_callback=pyaudio_callback,
        start=False,
        input_device_index=device_index,
    )
    stream.start_stream()

    recognize_thread = Thread(target=recognize_using_websocket, args=())
    recognize_thread.start()

    # BUGFIX: the original spun in `while not stop_now: pass`, pegging a
    # CPU core for the whole recording; sleep briefly between checks.
    while not stop_now:
        time.sleep(0.05)

    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    # BUGFIX: wait for the recognition thread to drain and close its
    # websocket instead of abandoning it (bounded so we can't hang).
    recognize_thread.join(timeout=10)
    return text
def watson_streaming_stt(filename: str, lang: str, encoding: str) -> str:
    """Transcribe *filename* by simulating a live stream into Watson STT.

    The WAV file is read whole, then drip-fed chunk-by-chunk into a
    bounded queue that a background websocket-recognition thread
    consumes. Returns the final transcript collected by the callback.
    """
    # Service client, authenticated with the module-level credentials.
    stt_client = SpeechToTextV1(authenticator=IAMAuthenticator(WATSON_API_KEY))
    stt_client.set_service_url(WATSON_STT_URL)

    # Audio source backed by the queue we fill below.
    feed_queue = Queue(maxsize=BUFFER_MAX_ELEMENT)
    source = AudioSource(feed_queue, True, True)
    callback = MyRecognizeCallback()

    # Whole-file read up front; rate is needed for the content type.
    wav_bytes, sample_rate = read_wav_file(filename)

    # Recognition runs in the background while we feed the queue.
    worker = Thread(
        target=stt_client.recognize_using_websocket,
        kwargs=dict(
            audio=source,
            content_type='audio/l16; rate={}'.format(sample_rate),
            recognize_callback=callback,
            interim_results=True,
        ),
    )
    worker.start()

    # Pretend this is a live microphone: one chunk every half second so
    # the callback gets a chance to run between chunks.
    for piece in simulate_stream(wav_bytes, CHUNK_SIZE):
        feed_queue.put(piece)
        time.sleep(0.5)

    # Signal end-of-audio, wait for recognition to finish, hand back text.
    source.completed_recording()
    worker.join()
    return callback.transcript
class WatsonRecognizer:
    """Live microphone speech recognition via IBM Watson Speech-to-Text.

    PyAudio pushes raw frames into a bounded queue; a background thread
    streams them to Watson over a websocket and results are delivered to
    ``transcription_q`` through ``WatsonCallback``.
    """

    # Note: It will discard if the websocket client can't consume fast enough
    # So, increase the max size as per your choice
    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10

    # Variables for recording the speech
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    def __init__(self, api_key='zPJij17cD8uAVUsaWqRgZPyGt9CH5q8XuwNGurfFhtXW'):
        """Set up the audio stream, Watson client and recognition thread.

        Args:
            api_key: IAM API key for the Watson STT service.
                SECURITY: the default is the key that used to be
                hard-coded here (kept for backward compatibility);
                prefer passing one loaded from an environment variable
                or a secrets store instead of shipping it in code.
        """
        from threading import Thread  # local: module imports are outside this block

        # Buffer to store audio.
        # NOTE(review): capacity here is BUF_MAX_SIZE *elements* (each
        # element is one CHUNK-sized frame buffer), much larger than the
        # BUF_MAX_SIZE/CHUNK chunk count used elsewhere in this file —
        # confirm this is intended.
        self.q = Queue(maxsize=self.BUF_MAX_SIZE)
        self.transcription_q = Queue()
        self.audio_source = AudioSource(self.q, True, True)
        self.callback = WatsonCallback(transcript_q=self.transcription_q, prints=True)

        # initialize speech to text service
        self.authenticator = IAMAuthenticator(api_key)
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)

        # instantiate audio
        self.audio = pyaudio.PyAudio()

        # open stream using callback (started explicitly in start())
        self.stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
            stream_callback=self.pyaudio_callback,
            start=False
        )

        # BUGFIX: this used to be a multiprocessing.Process, but self.q
        # and self.audio_source are ordinary in-process objects — a
        # child process only receives copies, so audio enqueued by
        # pyaudio_callback could never reach the recognizer. A Thread
        # shares the queue (and matches how the rest of this file runs
        # recognize_using_websocket).
        self.thread = Thread(
            target=self.speech_to_text.recognize_using_websocket,
            kwargs={
                "audio": self.audio_source,
                "content_type": "audio/l16; rate=44100",
                "recognize_callback": self.callback,
                "interim_results": True})

    def pyaudio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: enqueue one frame, dropping it if full."""
        try:
            # BUGFIX: a blocking put() never raises Full — it would stall
            # the audio callback instead. Non-blocking put makes the
            # intended discard-on-overflow actually happen.
            self.q.put(in_data, block=False)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    def start(self):
        """Begin capturing audio and streaming it to Watson."""
        if not self.running:
            self.stream.start_stream()
            self.thread.start()

    def stop(self, timeout=20):
        """Stop capturing and let the recognizer drain gracefully."""
        if self.running:
            self.stream.stop_stream()
            # Threads cannot be terminate()d; signalling end-of-audio
            # makes the websocket close and the thread exit on its own.
            self.audio_source.completed_recording()
            self.thread.join(timeout=timeout)

    def close(self, timeout=20):
        """Release all audio and recognition resources."""
        self.audio_source.completed_recording()
        if self.thread.is_alive():
            self.thread.join(timeout=timeout)
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

    @property
    def running(self):
        """True while the recognition thread is alive."""
        return self.thread.is_alive()
import time  # needed for the non-busy wait loop below

# instantiate pyaudio
audio = pyaudio.PyAudio()

# open stream using callback
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK,
                    stream_callback=pyaudio_callback, start=False)

#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################

print("Enter CTRL+C to end recording...")
stream.start_stream()

try:
    # NOTE: "weboscket" is the (misspelled) name this function is defined
    # under elsewhere in the file; keep it so the reference resolves.
    recognize_thread = Thread(target=recognize_using_weboscket, args=())
    recognize_thread.start()
    # BUGFIX: `while True: pass` pegged a CPU core for the whole
    # recording; sleep between iterations instead.
    while True:
        time.sleep(0.1)
except KeyboardInterrupt:
    # stop recording
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    # BUGFIX: wait (bounded) for the websocket thread to shut down
    # instead of abandoning it.
    recognize_thread.join(timeout=10)
def start_stt():
    """Swap the mic button to its 'on' image; clicking it calls end_stt."""
    mic_photo_on = PhotoImage(file=r"images/mic-on-50.png")
    mic_on = Button(window, text="Mic", image=mic_photo_on, background="white",
                    activebackground="white", border=0, command=end_stt)
    mic_on.place(x=900, y=148)


def space_break(event=None):
    """Send Ctrl+C to the process when <space> is pressed.

    BUGFIX: Tk invokes bound handlers with an event argument; the old
    zero-argument signature raised TypeError on every key press.
    """
    keyboard = Controller()
    with keyboard.pressed(Key.control):
        keyboard.press('c')
        keyboard.release('c')


window.bind('<space>', space_break)

# Queue import that works on both Python 2 and Python 3.
try:
    from Queue import Queue, Full
except ImportError:
    from queue import Queue, Full

import time  # for the non-busy wait loop below

###############################################
#### Initalize queue / the thing to store the audio recordings ##
###############################################

CHUNK = 1500
# *** if the websocket client isn't fast enough it will just discard
# *** if that happens it said to try using a larger max size
BUF_MAX_SIZE = CHUNK * 20

# Buffer to store audio (capacity counted in chunks)
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)

###############################################
#### Prepare Speech to Text Service ########
###############################################

# initialize speech to text service
# SECURITY NOTE(review): the API key is hard-coded; move it to an
# environment variable or credentials file before publishing this code.
speech_to_text = SpeechToTextV1(
    iam_apikey="ZAM8vwm2g3Dsnh1UPjOqyI-PloGvZ-PjSEAbjT_JHk1s",
    url="https://gateway-wdc.watsonplatform.net/speech-to-text/api")


# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)

    def on_transcription(self, transcript):
        print(transcript)

    def on_connected(self):
        print('Connection was successful')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))

    def on_listening(self):
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        print(hypothesis)

    def on_data(self, data):
        print(data)

    def on_close(self):
        print("Connection closed")


# this function will initiate the recognize service and pass in the
# AudioSource.  (The misspelling "weboscket" is kept because other code
# in this file refers to the function by this exact name.)
def recognize_using_weboscket(*args):
    mycallback = MyRecognizeCallback()
    speech_to_text.recognize_using_websocket(
        audio=audio_source,
        content_type='audio/l16; rate=44100',
        recognize_callback=mycallback,
        interim_results=True)


###############################################
#### Prepare the for recording using Pyaudio ##
###############################################

# Variables for recording the speech
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100


# define callback for pyaudio to store the recording in queue
def pyaudio_callback(in_data, frame_count, time_info, status):
    try:
        # BUGFIX: a blocking put() never raises Full (it would stall the
        # audio callback instead); non-blocking put makes the intended
        # discard-on-overflow actually work.
        q.put(in_data, block=False)
    except Full:
        pass  # discard
    return (None, pyaudio.paContinue)


# instantiate pyaudio
audio = pyaudio.PyAudio()

# open stream using callback
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK,
                    stream_callback=pyaudio_callback, start=False)

#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################

print("Enter CTRL+C to end recording...")
stream.start_stream()

try:
    recognize_thread = Thread(target=recognize_using_weboscket, args=())
    recognize_thread.start()
    # BUGFIX: `while True: pass` pegged a CPU core; sleep between checks.
    while True:
        time.sleep(0.1)
except KeyboardInterrupt:
    # stop recording
    # BUGFIX: Thread objects have no stop() method — the old
    # `recognize_thread.stop()` raised AttributeError on Ctrl+C.
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    # Wait (bounded) for the websocket thread to see end-of-audio.
    recognize_thread.join(timeout=10)
def listen_to_mic(api_key, service_url):
    """Record from the microphone and stream it to Watson Speech-to-Text.

    Prints transcriptions until the user presses Ctrl+C, then shuts the
    audio stream and websocket down cleanly.

    Args:
        api_key: IAM API key for the Speech-to-Text service.
        service_url: Regional service endpoint URL.
    """
    import time  # used to avoid a busy-wait below

    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, is_recording=True, is_buffer=True)

    # Prepare Speech to Text Service
    # initialize speech to text service
    authenticator = IAMAuthenticator(apikey=api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(service_url)

    # define callback for the speech to text service
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_transcription(self, transcript):
            print(transcript[0]['transcript'])

        def on_connected(self):
            print('Connection was successful')

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

        def on_listening(self):
            print('Service is listening')

        def on_hypothesis(self, hypothesis):
            pass  # print(hypothesis)

        def on_data(self, data):
            pass  # print(data)

        def on_close(self):
            print("Connection closed")

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(*args):
        mycallback = MyRecognizeCallback()
        if FORMAT == pyaudio.paInt16:
            content_type = f"audio/l16; rate={RATE}"
        else:
            raise NotImplementedError(
                "only pyaudio.paInt16 format is supported")
        speech_to_text.recognize_using_websocket(audio=audio_source,
                                                 content_type=content_type,
                                                 recognize_callback=mycallback,
                                                 interim_results=True)

    # Prepare the for recording using Pyaudio
    # define callback for pyaudio to store the recording in queue
    def pyaudio_callback(in_data, frame_count, time_info, status):
        try:
            # BUGFIX: a blocking put() never raises Full and would stall
            # the audio callback; non-blocking put actually discards.
            q.put(in_data, block=False)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    # instantiate pyaudio
    audio = pyaudio.PyAudio()

    # open stream using callback
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                        input=True, output=False, frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback, start=False)

    # Start the recording and start service to recognize the stream
    print("Enter CTRL+C to end recording...")
    stream.start_stream()

    try:
        recognize_thread = Thread(target=recognize_using_websocket, args=())
        recognize_thread.start()
        # BUGFIX: `while True: pass` pegged a CPU core; sleep instead.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        # stop recording
        stream.stop_stream()
        stream.close()
        audio.terminate()
        audio_source.completed_recording()
        # BUGFIX: wait (bounded) for the websocket thread to close after
        # end-of-audio instead of abandoning it.
        recognize_thread.join(timeout=10)
class ibmTranscribe:
    """Generator-based transcription of a live audio device via IBM Watson.

    Reads credentials from the configured IBM credentials file, then
    streams ``audio_device``'s buffer to the Speech-to-Text websocket
    API, yielding each response placed on the callback's response queue.
    """

    def __init__(self, audio_device):
        # Mirror the module-level capability flag so callers can check
        # the instance directly.
        self.is_supported = is_supported
        if not self.is_supported:
            return
        self.audio_device = audio_device

        # Parse KEY=VALUE lines from the IBM credentials file.
        APIKEY = None
        URL = None
        with open(speakreader.CONFIG.IBM_CREDENTIALS_FILE) as f:
            for line in f.read().splitlines():
                # BUGFIX: split on the first '=' only, so values that
                # themselves contain '=' are preserved intact; skip
                # lines with no '=' instead of risking an IndexError.
                parm = line.split('=', 1)
                if len(parm) != 2:
                    continue
                if parm[0] == 'SPEECH_TO_TEXT_APIKEY':
                    APIKEY = parm[1]
                if parm[0] == 'SPEECH_TO_TEXT_URL':
                    URL = parm[1]

        if APIKEY is None or URL is None:
            # BUGFIX: logger.warn is the deprecated alias of warning, and
            # the old code fell through to IAMAuthenticator(None) which
            # fails; mark the transcriber unsupported and stop here.
            logger.warning('ibmTranscribe: APIKEY or URL not found in credentials file')
            self.is_supported = False
            return

        # initialize speech to text service
        self.authenticator = IAMAuthenticator(APIKEY)
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
        self.speech_to_text.set_service_url(URL)
        self.mycallback = ProcessResponses()
        self.audio_source = AudioSource(audio_device._streamBuff, is_recording=True, is_buffer=True)

    def transcribe(self):
        """Yield transcription responses until the callback enqueues None."""
        if not self.is_supported:
            return
        # Generator to return transcription results
        logger.debug('ibmTranscribe.transcribe ENTER')
        recognize_thread = Thread(target=self.recognize_using_websocket, args=())
        recognize_thread.start()
        while True:
            response = self.mycallback.responseQueue.get()
            if response is None:
                # Sentinel from the callback: recognition is finished.
                break
            yield response
        self.audio_source.completed_recording()
        recognize_thread.join()
        logger.debug('ibmTranscribe.transcribe EXIT')

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(self, *args):
        logger.debug("ibmTransribe.recognize_using_websocket ENTER")
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type='audio/l16; rate=%s' % self.audio_device._outputSampleRate,
            recognize_callback=self.mycallback,
            interim_results=True,
            max_alternatives=1,
            # Never drop the connection during silence.
            inactivity_timeout=-1,
            smart_formatting=True,
            word_alternatives_threshold=0.75,
            profanity_filter=bool(speakreader.CONFIG.ENABLE_CENSORSHIP),
        )
        logger.debug("ibmTransribe.recognize_using_websocket EXIT")