def listen_for_text(context="none"):
    """Capture microphone audio and stream it to Watson Speech to Text.

    Blocks until the recognize callback sets the global ``stop_now`` flag,
    then tears down the PyAudio stream and returns the recognized ``text``.

    :param context: opaque context label (currently unused beyond debugging)
    :return: the transcription accumulated in the global ``text``
    """
    import time  # local import: used only by the polling loop below
    global stop_now, text, q, service, audio_source
    stop_now = False
    iam_apikey = os.environ['IAM_APIKEY']
    service = SpeechToTextV1(
        url='https://gateway-lon.watsonplatform.net/speech-to-text/api',
        iam_apikey=iam_apikey)

    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, True, True)

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False,
                        input_device_index=2)  # NOTE(review): hard-coded device index — confirm it exists on the target host
    stream.start_stream()

    recognize_thread = Thread(target=recognize_using_websocket, args=())
    recognize_thread.start()

    # FIX: the original spun in `while not stop_now: pass`, burning a full
    # CPU core; sleep briefly between polls instead.
    while not stop_now:
        time.sleep(0.05)

    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    # FIX: wait for the recognizer thread to drain so `text` is fully
    # populated before it is returned.
    recognize_thread.join()
    return text
def __init__(self, audio_device):
    """Set up the IBM Watson transcriber for the given audio device.

    Reads the API key and service URL from the credentials file named in the
    SpeakReader config, builds the SpeechToTextV1 client, and attaches an
    AudioSource to the device's stream buffer.

    :param audio_device: capture device exposing ``_streamBuff``
    """
    self.is_supported = is_supported
    if not self.is_supported:
        return
    self.audio_device = audio_device
    APIKEY = None
    URL = None
    with open(speakreader.CONFIG.IBM_CREDENTIALS_FILE) as f:
        for line in f.read().splitlines():
            # FIX: split on the first '=' only, so values that themselves
            # contain '=' are not truncated.
            parm = line.split('=', 1)
            if parm[0] == 'SPEECH_TO_TEXT_APIKEY':
                APIKEY = parm[1]
            if parm[0] == 'SPEECH_TO_TEXT_URL':
                URL = parm[1]
    if APIKEY is None or URL is None:
        # FIX: logger.warn() is deprecated (use warning()); also mark the
        # transcriber unsupported and bail out instead of crashing below in
        # IAMAuthenticator(None).
        logger.warning('ibmTranscribe: APIKEY or URL not found in credentials file')
        self.is_supported = False
        return
    # initialize speech to text service
    self.authenticator = IAMAuthenticator(APIKEY)
    self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
    self.speech_to_text.set_service_url(URL)
    self.mycallback = ProcessResponses()
    self.audio_source = AudioSource(audio_device._streamBuff, is_recording=True, is_buffer=True)
def __init__(self):
    """Build the recorder: audio buffer queues, Watson STT client, PyAudio
    stream, and the recognition worker process (not started here)."""
    # Buffer to store audio
    # NOTE(review): maxsize here is BUF_MAX_SIZE items (CHUNK*10), unlike the
    # BUF_MAX_SIZE/CHUNK pattern used elsewhere in this file — confirm the
    # intended queue depth.
    self.q = Queue(maxsize=self.BUF_MAX_SIZE)
    self.transcription_q = Queue()
    self.audio_source = AudioSource(self.q, True, True)
    self.callback = WatsonCallback(transcript_q=self.transcription_q, prints=True)
    # initialize speech to text service
    # SECURITY NOTE(review): API key hard-coded in source — move to env/config.
    self.authenticator = IAMAuthenticator('zPJij17cD8uAVUsaWqRgZPyGt9CH5q8XuwNGurfFhtXW')
    self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
    # instantiate audio
    self.audio = pyaudio.PyAudio()
    # open stream using callback (started later, not here)
    self.stream = self.audio.open(
        format=self.FORMAT,
        channels=self.CHANNELS,
        rate=self.RATE,
        input=True,
        frames_per_buffer=self.CHUNK,
        stream_callback=self.pyaudio_callback,
        start=False
    )
    # thread for the speech recognition
    # NOTE(review): this is a multiprocessing.Process fed queue.Queue objects,
    # which are copied into the child — results may never reach the parent's
    # transcription_q; confirm Process (vs Thread) is intended.
    self.thread = Process(target=self.speech_to_text.recognize_using_websocket, kwargs={
        "audio": self.audio_source,
        "content_type": "audio/l16; rate=44100",
        "recognize_callback": self.callback,
        "interim_results": True})
def landing_audio(request):
    """Transcribe the bundled sample recording and render the landing page."""
    stt_service = SpeechToTextV1(
        iam_apikey='',
        url='https://stream.watsonplatform.net/speech-to-text/api')

    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_data(self, data):
            print(json.dumps(data, indent=2))

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

    callback = MyRecognizeCallback()
    sample_path = join(dirname(__file__), '01_denunciante.ogg')
    with open(sample_path, 'rb') as audio_file:
        source = AudioSource(audio_file)
        stt_service.recognize_using_websocket(
            audio=source,
            content_type='audio/ogg;codecs=vorbis',
            recognize_callback=callback,
            model='es-ES_NarrowbandModel',
            max_alternatives=1)
    return render(request, 'home/landing.html', {'data': "Hello"})
def ibm_recog(self, audioname, audiofp):
    """Transcribe an MP3 via Watson STT and load timestamps/confidence into
    this object's audio state.

    :param audioname: path used only for its directory component
    :param audiofp: audio file name (joined onto dirname(audioname))
    """
    # SECURITY NOTE(review): hard-coded credentials — move to env/config.
    authenticator = IAMAuthenticator(
        '6noBhxJHkbRVsgbxsl47v6dFZnJdoRRrDRYte7GgKKxu')
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(
        'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/51085e72-7959-4c18-94cd-d4d874baf61d'
    )
    myRecognizeCallback = MyRecognizeCallback()
    ts = []  # per-result timestamp lists
    c = []   # per-result confidences
    with open(join(dirname(audioname), audiofp), 'rb') as audio_file:
        audio_source = AudioSource(audio_file)
        x = speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/mp3',
            inactivity_timeout=-1,
            recognize_callback=myRecognizeCallback,
            model='en-US_BroadbandModel',
            timestamps=True,
            smart_formatting=True,
        )
        # BUG NOTE(review): `result` is never defined in this method (and `x`
        # is unused) — this loop raises NameError at runtime. The results
        # presumably live on myRecognizeCallback; confirm and fix.
        for r in result:
            alternatives = r.get('alternatives')
            ts.append(alternatives[0].get('timestamps'))
            timestamps = [elem for twod in ts for elem in twod]
            c.append(alternatives[0].get('confidence'))
    confidence = sum(c) / len(c)
    a, sr = open_audio(audiofp)
    self.initAudio(a, sr)
    self.setupIBM(timestamps, confidence)
    self.audiofp = audiofp
def test_recognize_using_websocket(self):
    """Round-trip a known WAV through the websocket API on a worker thread."""
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)
            self.error = None
            self.transcript = None

        def on_error(self, error):
            self.error = error

        def on_transcription(self, transcript):
            self.transcript = transcript

    callback = MyRecognizeCallback()
    speech_path = os.path.join(os.path.dirname(__file__),
                               '../../resources/speech.wav')
    with open(speech_path, 'rb') as audio_file:
        source = AudioSource(audio_file, False)
        worker = threading.Thread(
            target=self.speech_to_text.recognize_using_websocket,
            args=(source, "audio/l16; rate=44100", callback))
        worker.start()
        worker.join()
    assert callback.error is None
    assert callback.transcript is not None
    assert callback.transcript[0][
        'transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '
def test_on_transcription_interim_results_true_low_latency_false(self):
    """Check transcription with interim results on and low latency off."""
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)
            self.error = None
            self.transcript = None

        def on_error(self, error):
            self.error = error

        def on_transcription(self, transcript):
            self.transcript = transcript
            assert transcript[0]['confidence'] is not None
            assert transcript[0]['transcript'] is not None

    callback = MyRecognizeCallback()
    audio_path = os.path.join(os.path.dirname(__file__),
                              '../../resources/speech_with_pause.wav')
    with open(audio_path, 'rb') as audio_file:
        self.speech_to_text.recognize_using_websocket(
            AudioSource(audio_file, False),
            "audio/wav",
            callback,
            model="en-US_Telephony",
            interim_results=True,
            low_latency=False)
    assert callback.error is None
    assert callback.transcript is not None
    assert callback.transcript[0]['transcript'] == 'and heavy rain '
def __init__(self):
    """Wire up the pipeline: shared queue, recognizer callback/service,
    PyAudio handle, and the Watson audio source fed by the queue."""
    global QUEUE
    QUEUE = self.createQueue()
    # Recognition side: callback object and the STT service client.
    self.MYCALLBACK = MyRecognizeCallback()
    self.SPEECHSERVICE = initSpeechText()
    # Capture side: PyAudio plus an AudioSource reading from the shared queue.
    self.AUDIO = pyaudio.PyAudio()
    self.AUDIO_SOURCE = AudioSource(QUEUE, is_recording=True, is_buffer=True)
def recognize(self, audio_data):
    """Stream the captured audio (as FLAC) to Watson STT with keyword spotting.

    :param audio_data: object exposing ``get_flac_data() -> bytes``
    """
    from io import BytesIO  # local import keeps the fix self-contained

    # FIX: AudioSource expects a file-like object (or a queue), not a raw
    # bytes payload — wrap the FLAC bytes in a BytesIO first.
    flac_data = AudioSource(BytesIO(audio_data.get_flac_data()))
    speech_to_text.recognize_using_websocket(
        audio=flac_data,
        content_type='audio/flac',
        recognize_callback=myRecognizeCallback,
        model='en-US_BroadbandModel',
        keywords=['colorado', 'tornado', 'tornadoes'],
        keywords_threshold=0.5,
        max_alternatives=3)
def watson_streaming_stt(filename: str, lang: str, encoding: str) -> str:
    """Transcribe a WAV file by simulating a live stream into Watson STT.

    The file is chunked into a buffer queue feeding an AudioSource while
    recognize_using_websocket runs on a worker thread; returns the final
    transcript collected by the callback.
    """
    authenticator = IAMAuthenticator(WATSON_API_KEY)
    stt_service = SpeechToTextV1(authenticator=authenticator)
    stt_service.set_service_url(WATSON_STT_URL)

    # Watson audio source fed by a bounded buffer queue.
    feed_queue = Queue(maxsize=BUFFER_MAX_ELEMENT)
    source = AudioSource(feed_queue, True, True)

    # Callback object that accumulates the transcript.
    callback = MyRecognizeCallback()

    # Load the audio to stream.
    buffer, rate = read_wav_file(filename)

    # Recognition runs concurrently while the queue is fed below.
    worker = Thread(target=stt_service.recognize_using_websocket, kwargs={
        'audio': source,
        'content_type': 'audio/l16; rate={}'.format(rate),
        'recognize_callback': callback,
        'interim_results': True
    })
    worker.start()

    # Simulate a live stream: push chunks with a small delay so the
    # callbacks get a chance to run.
    for chunk in simulate_stream(buffer, CHUNK_SIZE):
        feed_queue.put(chunk)
        time.sleep(0.5)

    # Signal end-of-audio and wait for recognition to finish.
    source.completed_recording()
    worker.join()

    return callback.transcript
def audioFile_to_queue(self, fileToRecognize):
    """Open an audio file relative to this module and stream it to the STT
    service through self.audio_callback.

    :param fileToRecognize: WAV file name/path to transcribe
    """
    # FIX: the original passed the string literal '__file__' to dirname(),
    # which always yields '' — use the module's real __file__.
    with open(join(dirname(__file__), fileToRecognize), 'rb') as audio_file:
        self.service.recognize_using_websocket(
            AudioSource(audio_file),
            'audio/wav',
            self.audio_callback,
            model='en-US_NarrowbandModel',
            continuous=True)  # NOTE(review): `continuous` was removed from newer Watson SDKs — confirm the version in use
def sst_response(audio_pathfile, speech_to_text, keywords, custom_id):
    """Return callback response of SST using one audiofile.

    :param audio_pathfile: path to the MP3 file to transcribe
    :param speech_to_text: configured SpeechToTextV1 client
    :param keywords: keywords to spot
    :param custom_id: language customization id
    :return: the populated recognize-callback object
    """
    my_recognize_callback = MyRecognizeCallback()
    with open(audio_pathfile, 'rb') as audio_file:
        audio_source = AudioSource(audio_file)
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/mp3',
            recognize_callback=my_recognize_callback,
            model='es-CO_NarrowbandModel',
            language_customization_id=custom_id,
            keywords=keywords,
            keywords_threshold=0.5,
            speaker_labels=True)
    # FIX: the docstring promises the callback response, but the original
    # returned None — hand back the populated callback object.
    return my_recognize_callback
def recognize(self, archive_audio):
    """Transcribe a webm audio archive via the Watson websocket API.

    API failures are converted into a formatted server-error message rather
    than propagating.
    """
    try:
        source = AudioSource(archive_audio, True, True)
        return self.speech_to_text.recognize_using_websocket(
            audio=source,
            content_type='audio/webm',
            model='pt-BR_BroadbandModel',
            interim_results=False)
    except ApiException as ex:
        return format_msg(Response.ERROR_SERVER, ex.message, ex.code)
def parse_audio(path):
    """Transcribe <path>/recording.mp3 with Watson STT (speaker labels on).

    Writes each raw JSON response to <path>/sample.json and returns the list
    of responses.

    :param path: directory containing recording.mp3
    :return: list of JSON response dicts collected from the service
    """
    audio = path + '/recording.mp3'
    print(audio)  # FIX: function-call print (original was a Python-2 print statement)
    CHUNK = 1024
    # Note: the websocket client discards frames it can't consume fast
    # enough, so increase the max size as needed.
    BUF_MAX_SIZE = CHUNK * 10
    # SECURITY NOTE(review): API key hard-coded in source — move to env/config.
    speech_to_text = SpeechToTextV1(
        iam_apikey='9e0ri-mtT_R8DicTjLTNkRe9T1WJFxHdkFBYobAmlxp2',
        url=
        'https://gateway-wdc.watsonplatform.net/speech-to-text/api/v1/recognize'
    )
    speech_to_text.disable_SSL_verification()
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_data(self, data):
            q.put(data)

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

    myRecognizeCallback = MyRecognizeCallback()
    # read input audio file
    with open(audio, 'rb') as audio_file:
        audio_source = AudioSource(audio_file)
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/mp3',
            recognize_callback=myRecognizeCallback,
            model='en-US_BroadbandModel',
            speaker_labels=True)
    # FIX: the original drained the queue with q.get() while writing the
    # file, then returned list(q.queue) — always empty. Drain once into a
    # list, write it, and return that list.
    results = []
    while not q.empty():
        results.append(q.get())
    with open(path + '/sample.json', 'w+') as f:
        for item in results:
            f.write(json.dumps(item))
    return results
def traducirVozaTexto():
    """Recognize 'salser.wav' via Watson STT and print the service model info."""
    callback = MyRecognizeCallback()
    with open('salser.wav', 'rb') as audio_file:
        source = AudioSource(audio_file)
        speech_to_text.recognize_using_websocket(
            audio=source,
            content_type='audio/wav',
            recognize_callback=callback,
            model='es-MX_NarrowbandModel',
            keywords=["tiene"],
            keywords_threshold=0.5,
            max_alternatives=1)
    print(speech_to_text.get_model(0))
def main():
    """ROS node: stream an audio topic into Watson STT and publish transcripts."""
    rospy.init_node('s2t_rt', anonymous=True)

    # Get parameters
    input_topic = rospy.get_param('~input_topic')
    credentials_path = rospy.get_param('~credentials_path')
    format = rospy.get_param('~format', 'PCM')

    # Get credentials
    with open(credentials_path) as cf:
        credentials = yaml.safe_load(cf)
    speech_to_text = SpeechToTextV1(iam_apikey=credentials['apikey'],
                                    url=credentials['url'])

    queue = Queue(maxsize=10)
    audio_source = AudioSource(queue, is_recording=True, is_buffer=True)
    recognize_callback = MyRecognizeCallback('~transcript', '~interim')

    # Peek at one message to learn the stream's sample format.
    msg = rospy.wait_for_message(input_topic, AudioData)
    if format == 'FLAC':
        content_type = 'audio/flac'
    else:
        # Get content type from message
        endianness = 'big-endian' if msg.is_bigendian else 'little-endian'
        content_type = """audio/l16; rate={}; channels={}; endianness={}""".format(msg.sample_rate, msg.num_channels, endianness)

    # NOTE(review): recognize_using_websocket is assumed to return an object
    # exposing start()/close() here — confirm against the SDK version in use.
    recognizer = speech_to_text.recognize_using_websocket(
        audio=audio_source,
        content_type=content_type,
        recognize_callback=recognize_callback,
        interim_results=True,
        inactivity_timeout=-1)
    recognize_thread = threading.Thread(target=recognizer.start, args=())
    recognize_thread.daemon = True
    recognize_thread.start()

    def callback(msg):
        if format == 'FLAC':
            dtype = width_to_dtype(msg.sample_width)
            # FIX: np.fromstring is deprecated for binary input; frombuffer
            # reads the same bytes without the deprecation warning.
            data = np.frombuffer(msg.data, dtype)
            with io.BytesIO() as flac_file:
                sf.write(flac_file, data, msg.sample_rate, format=format)
                # FIX: the original referenced the undefined name `ogg_file`
                # here (NameError) — the buffer is `flac_file`.
                queue.put(str(flac_file.getvalue()))
        else:
            queue.put(str(msg.data))

    rospy.Subscriber(input_topic, AudioData, callback)
    rospy.spin()
    recognizer.close()
def speech_to_text(audio_file, source_lang="en-US_BroadbandModel"):
    """Transcribe a FLAC file located under THIS_PATH and return the transcript.

    :param audio_file: file name relative to THIS_PATH
    :param source_lang: Watson model id to use
    :return: transcript string collected by the callbacks object
    """
    audio_path = "{}/{}".format(THIS_PATH, audio_file)
    print("transcribing from: {}".format(audio_path))
    callbacks = TranscriberCallbacks()
    with open(audio_path, "rb") as flac:
        speech2txt.recognize_using_websocket(
            audio=AudioSource(flac),
            content_type="audio/flac",
            recognize_callback=callbacks,
            model=source_lang)
    print("transcript: {}\n".format(callbacks.transcript))
    return callbacks.transcript
def RecognizeAudio(self):
    """Transcribe payload.mp3 with Watson STT, spotting a fixed keyword set."""
    # FIX: the original bound its callback instance to the name
    # `RecognizeCallback`, shadowing the SDK class of the same name.
    finish_callback = FinishRecognize()
    authenticator = IAMAuthenticator(apikey)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(
        'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/edf44363-198b-489f-9aa8-a320cd094d65'
    )
    with open('payload.mp3', 'rb') as audio:
        audio_source = AudioSource(audio)
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/mpeg',
            recognize_callback=finish_callback,
            model='en-US_BroadbandModel',
            keywords=['verdict', 'is', 'hot'],
            keywords_threshold=0,
            max_alternatives=3)
def stt(self, target_dir='stt_results/'):
    """Run speech-to-text, reusing a cached JSON result when one exists.

    :param target_dir: directory where cached STT results are stored
    """
    print('\n--- START STT ---\n')
    result_filename = target_dir + self.filename + '.json'
    if os.path.exists(result_filename):
        # Cached result found — load it instead of calling the service.
        print('ファイルあったのん!')
        with open(result_filename, encoding="shift_jis") as f:
            cached = json.load(f)
        self.set_result(cached)
    else:
        with open(self.filepath, 'rb') as audio_file:
            source = AudioSource(audio_file)
            self.speech_to_text.recognize_using_websocket(
                audio=source,
                content_type='audio/' + self.fileext[1:],  # drop the leading '.' from the extension
                recognize_callback=self.rCallback,
                model='ja-JP_BroadbandModel',
                max_alternatives=1)
def speechtotext(languagemodel):
    """Run Watson STT over the extracted vocals track with the given model.

    :param languagemodel: Watson model id (e.g. 'en-US_BroadbandModel')
    """
    # SECURITY NOTE(review): hard-coded credentials — move to env/config.
    authenticator = IAMAuthenticator(
        '6vGtDAgc9UpxbdGX5x00ULZOAdV2U_Jaz1CE0T6_sdpu')
    stt = SpeechToTextV1(authenticator=authenticator)
    stt.set_service_url(
        'https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/a2cc1293-2cef-4b7b-90b1-ae97d16b3081'
    )
    callback = MyRecognizeCallback()
    with open('./output/testvideo/vocals.wav', 'rb') as audio_fill:
        stt.recognize_using_websocket(
            audio=AudioSource(audio_fill),
            content_type='audio/wav',
            recognize_callback=callback,
            model=languagemodel,
            keywords=['colorado', 'tornado', 'tornadoes'],
            keywords_threshold=0.5,
            max_alternatives=1)
def speech_to_text(file_path):
    """Transcribe the WAV file at file_path and return the transcript.

    :param file_path: path to a WAV file
    :return: transcript collected by the recognize callback
    """
    authenticator = IAMAuthenticator('{Your key}')
    stt = SpeechToTextV1(authenticator=authenticator)
    # NOTE(review): set_service_url is normally given the instance base URL;
    # this one ends in '/v1/recognize' — confirm the SDK doesn't append the
    # path a second time.
    stt.set_service_url(
        'https://api.kr-seo.speech-to-text.watson.cloud.ibm.com/instances/c2523ff6-bb4f-4d41-9134-a2327a107b75/v1/recognize'
    )
    callback = MyRecognizeCallback()
    with open(file_path, 'rb') as audio_file:
        stt.recognize_using_websocket(
            audio=AudioSource(audio_file),
            content_type='audio/wav',
            recognize_callback=callback,
            model='en-US_BroadbandModel',
            max_alternatives=1)
    return callback.transcript
def RecognizeAudio(self):
    """Transcribe payload.wav with Watson STT, spotting spoken digits."""
    # FIX: the original bound its callback instance to the name
    # `RecognizeCallback`, shadowing the SDK class of the same name.
    finish_callback = FinishRecognize()
    authenticator = IAMAuthenticator(apikey)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(
        'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/edf44363-198b-489f-9aa8-a320cd094d65'
    )
    with open('payload.wav', 'rb') as audio:
        audio_source = AudioSource(audio)
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/wav',
            recognize_callback=finish_callback,
            model='en-US_NarrowbandModel',
            keywords=[
                'one', 'two', 'three', 'four', 'five', 'six', 'seven',
                'eight', 'nine', 'zero'
            ],
            keywords_threshold=1,
            max_alternatives=3)
# initialize speech to text service API_KEY = 'd_vI7npJhICly_5HOdyLYJYVlXU0QnCQOiSxjNil6qdl' API_URL = 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/eb505cb9-2feb-484c-ba93-7af0539d6dd7' authenticator = IAMAuthenticator(API_KEY) speech_to_text = SpeechToTextV1(authenticator=authenticator) #initalize queue to store the recordings ## CHUNK = 1024 #Note: It will discard if the websocket client can't consumme fast enough #So, increase the max size as per your choice BUF_MAX_SIZE = CHUNK * 10 #buffer to store audio q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK))) #create an instance of AudioSource audio_source = AudioSource(q, True, True) #translator translator = Translator(to_lang="spanish") #report file REPORT_FILENAME = "report.txt" #global disfluencyCount = 0 captureText = '' translatedText = '' realtimeText = '' grade = 100 isStarted = False mood = ''
def stt():
    """
    Speech To Text core: record one utterance from the microphone (simple
    silence/threshold gating), wrap it in a WAV header, send it to Watson,
    and return the recognized text.
    :return: recognized text, or an error message string
    """
    read_audio = PyAudio()
    stream = read_audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
    )
    print("Listening...")
    received = b''
    voice = b''
    rel = int(RATE / BUFFER)
    silence = deque(maxlen=SILENCE * rel)
    # NOTE(review): slicing empty bytes always yields b'' — prev_audio never
    # holds any pre-roll audio here; confirm the intent.
    prev_audio = b''[:int(rel / 2)]
    started = False
    n = 1  # depricated, but still might work! Change value for n of pauses you will make
    while n > 0:
        current_data = stream.read(BUFFER)
        # print(current_data)  # use for debug!
        silence.append(sqrt(abs(avg(current_data, 4))))
        # Loudness gate: any recent sample above THRESHOLD means "speaking".
        if sum([x > THRESHOLD for x in silence]) > 0:
            if not started:
                print("Recording...")
                started = True
            voice += current_data
        elif started is True:
            # Speech just ended: keep the utterance and reset the gate.
            received = voice
            started = False
            silence = deque(maxlen=SILENCE * rel)
            prev_audio = b''[:int(rel / 2)]
            voice = b''
            n -= 1
        else:
            prev_audio += current_data
    print("Processing...")
    # Prepend a fixed 44.1kHz-mono WAV header so Watson accepts raw frames.
    final = b'RIFF\xff\xff\xff\xffWAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00"V' \
            b'\x00\x00D\xac\x00\x00\x02\x00\x10\x00LIST\x1a\x00\x00\x00INFOISFT' \
            b'\x0e\x00\x00\x00Lavf58.29.100\x00data' + received
    received_data = BytesIO(final)

    class MyRecognizeCallback(RecognizeCallback):
        """ Callback class from Watson """
        def __init__(self):
            RecognizeCallback.__init__(self)
            self.result = ''
            # NOTE(review): on_error() is invoked at construction so that
            # `result` defaults to the failure message until on_data fires —
            # confirm this is deliberate.
            self.on_error('Couldn\'t hear what you said. Please try again later')

        def on_data(self, data):
            """
            If the voice is recognised
            :param data:
            """
            self.result = data['results'][0]['alternatives'][0]['transcript']

        def on_error(self, error):
            """
            If error occurs or the voice is not recognised
            :param error:
            """
            self.result = 'Error received: {}'.format(error)

    my_recognize_callback = MyRecognizeCallback()
    audio_source = AudioSource(received_data)
    speech_to_text.recognize_using_websocket(
        audio=audio_source,
        content_type='audio/wav',
        recognize_callback=my_recognize_callback,
        model='en-US_BroadbandModel')
    received_data.close()
    stream.stop_stream()
    stream.close()
    read_audio.terminate()
    print('WARVIS recognised:\n"{}"'.format(
        my_recognize_callback.result.strip()))
    return my_recognize_callback.result
def listen_to_mic(api_key, service_url):
    """Stream microphone audio to Watson Speech to Text until CTRL+C.

    :param api_key: IBM Cloud IAM API key for the STT service
    :param service_url: instance-specific STT service URL
    """
    import time  # local import: used only by the polling loop below

    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, is_recording=True, is_buffer=True)

    # Prepare Speech to Text Service
    # initialize speech to text service
    authenticator = IAMAuthenticator(apikey=api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(service_url)

    # define callback for the speech to text service
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_transcription(self, transcript):
            print(transcript[0]['transcript'])

        def on_connected(self):
            print('Connection was successful')

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

        def on_listening(self):
            print('Service is listening')

        def on_hypothesis(self, hypothesis):
            pass  # print(hypothesis)

        def on_data(self, data):
            pass  # print(data)

        def on_close(self):
            print("Connection closed")

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(*args):
        mycallback = MyRecognizeCallback()
        if FORMAT == pyaudio.paInt16:
            content_type = f"audio/l16; rate={RATE}"
        else:
            raise NotImplementedError(
                "only pyaudio.paInt16 format is supported")
        speech_to_text.recognize_using_websocket(audio=audio_source,
                                                 content_type=content_type,
                                                 recognize_callback=mycallback,
                                                 interim_results=True)

    # define callback for pyaudio to store the recording in queue
    def pyaudio_callback(in_data, frame_count, time_info, status):
        try:
            q.put(in_data)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    # instantiate pyaudio and open the stream using the callback
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        output=False,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False)

    # Start the recording and start service to recognize the stream
    print("Enter CTRL+C to end recording...")
    stream.start_stream()
    try:
        recognize_thread = Thread(target=recognize_using_websocket, args=())
        recognize_thread.start()
        # FIX: sleep between polls rather than spinning at 100% CPU.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        # stop recording
        stream.stop_stream()
        stream.close()
        audio.terminate()
        audio_source.completed_recording()
        # FIX: wait for the recognizer thread to drain and exit cleanly.
        recognize_thread.join()
class WatsonRecognizer:
    """Microphone → IBM Watson streaming speech recognizer.

    Audio frames are pushed from a PyAudio callback into ``q``; a separate
    process runs recognize_using_websocket over that buffer, and transcripts
    arrive on ``transcription_q`` via WatsonCallback.
    """

    # Note: It will discard if the websocket client can't consume fast enough
    # So, increase the max size as per your choice
    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10

    # Variables for recording the speech
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    def __init__(self):
        # Buffer to store audio
        self.q = Queue(maxsize=self.BUF_MAX_SIZE)
        self.transcription_q = Queue()
        self.audio_source = AudioSource(self.q, True, True)
        self.callback = WatsonCallback(transcript_q=self.transcription_q, prints=True)
        # initialize speech to text service
        # SECURITY NOTE(review): API key hard-coded in source — move to env/config.
        self.authenticator = IAMAuthenticator('zPJij17cD8uAVUsaWqRgZPyGt9CH5q8XuwNGurfFhtXW')
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
        # instantiate audio
        self.audio = pyaudio.PyAudio()
        # open stream using callback (started later by start())
        self.stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
            stream_callback=self.pyaudio_callback,
            start=False
        )
        # thread for the speech recognition
        # NOTE(review): this is a multiprocessing.Process fed queue.Queue
        # objects, which are copied into the child — results may never reach
        # the parent's transcription_q; confirm Process (vs Thread) is intended.
        self.thread = Process(target=self.speech_to_text.recognize_using_websocket, kwargs={
            "audio": self.audio_source,
            "content_type": "audio/l16; rate=44100",
            "recognize_callback": self.callback,
            "interim_results": True})

    def pyaudio_callback(self, in_data, frame_count, time_info, status):
        # Push captured frames into the buffer; drop them if it is full.
        try:
            self.q.put(in_data)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    def start(self):
        # Begin capturing and recognizing (no-op if already running).
        if not self.running:
            self.stream.start_stream()
            self.thread.start()

    def stop(self, timeout=20):
        # Pause capture and terminate the recognizer process.
        if self.running:
            self.stream.stop_stream()
            self.thread.terminate()
            self.thread.join(timeout=timeout)

    def close(self, timeout=20):
        # Full teardown: recognizer process, stream, PyAudio, audio source.
        self.thread.terminate()
        self.thread.join(timeout=timeout)
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()
        self.audio_source.completed_recording()

    @property
    def running(self):
        # True while the recognizer process is alive.
        return self.thread.is_alive()
class ibmTranscribe:
    """Streams audio from an audio device to IBM Watson Speech to Text and
    yields the service's responses."""

    def __init__(self, audio_device):
        """Load credentials, build the STT client, and attach an AudioSource
        to the device's stream buffer.

        :param audio_device: capture device exposing ``_streamBuff`` and
            ``_outputSampleRate``
        """
        self.is_supported = is_supported
        if not self.is_supported:
            return
        self.audio_device = audio_device
        APIKEY = None
        URL = None
        with open(speakreader.CONFIG.IBM_CREDENTIALS_FILE) as f:
            for line in f.read().splitlines():
                # FIX: split on the first '=' only, so values containing '='
                # are not truncated.
                parm = line.split('=', 1)
                if parm[0] == 'SPEECH_TO_TEXT_APIKEY':
                    APIKEY = parm[1]
                if parm[0] == 'SPEECH_TO_TEXT_URL':
                    URL = parm[1]
        if APIKEY is None or URL is None:
            # FIX: logger.warn() is deprecated (use warning()); also mark the
            # transcriber unsupported and bail out instead of crashing below
            # in IAMAuthenticator(None).
            logger.warning('ibmTranscribe: APIKEY or URL not found in credentials file')
            self.is_supported = False
            return
        # initialize speech to text service
        self.authenticator = IAMAuthenticator(APIKEY)
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
        self.speech_to_text.set_service_url(URL)
        self.mycallback = ProcessResponses()
        self.audio_source = AudioSource(audio_device._streamBuff, is_recording=True, is_buffer=True)

    def transcribe(self):
        """Generator yielding transcription responses until the callback
        signals completion by queuing None."""
        if not self.is_supported:
            return
        logger.debug('ibmTranscribe.transcribe ENTER')
        recognize_thread = Thread(target=self.recognize_using_websocket, args=())
        recognize_thread.start()
        while True:
            response = self.mycallback.responseQueue.get()
            if response is None:
                break
            yield response
        self.audio_source.completed_recording()
        recognize_thread.join()
        logger.debug('ibmTranscribe.transcribe EXIT')

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(self, *args):
        logger.debug("ibmTransribe.recognize_using_websocket ENTER")
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type='audio/l16; rate=%s' % self.audio_device._outputSampleRate,
            recognize_callback=self.mycallback,
            interim_results=True,
            max_alternatives=1,
            inactivity_timeout=-1,
            smart_formatting=True,
            word_alternatives_threshold=0.75,
            profanity_filter=bool(speakreader.CONFIG.ENABLE_CENSORSHIP),
        )
        logger.debug("ibmTransribe.recognize_using_websocket EXIT")
# Queue/Full live in `Queue` on Python 2 and `queue` on Python 3.
try:
    from Queue import Queue, Full
except ImportError:
    from queue import Queue, Full

###############################################
#### Initalize queue to store the recordings ##
###############################################
CHUNK = 1024
# Note: It will discard if the websocket client can't consumme fast enough
# So, increase the max size as per your choice
BUF_MAX_SIZE = CHUNK * 10

# Buffer to store audio
q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

# Create an instance of AudioSource
audio_source = AudioSource(q, True, True)

config = Utils.readYaml("config.yaml")

###############################################
#### Prepare Speech to Text Service ########
###############################################
# initialize speech to text service
authenticator = IAMAuthenticator(config['watson']['API_KEY'])
speech_to_text = SpeechToTextV1(authenticator=authenticator)
speech_to_text.set_service_url(config['watson']['URL'])

# define callback for the speech to text service
class MyRecognizeCallback(RecognizeCallback):
    def on_transcription(self, transcript):
        print(transcript)

    def on_connected(self):
        print('Connection was successful')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))

    def on_listening(self):
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        print(hypothesis)

    def on_data(self, data):
        print(data)

# Example using threads in a non-blocking way
mycallback = MyRecognizeCallback()
audio_file = open(join(dirname(__file__), '../resources/speech.wav'), 'rb')
audio_source = AudioSource(audio_file)
# NOTE(review): `service` is not defined in this snippet — the client built
# above is named `speech_to_text`; confirm which object is intended.
recognize_thread = threading.Thread(target=service.recognize_using_websocket,
                                    args=(audio_source, "audio/l16; rate=44100", mycallback))
recognize_thread.start()
def start_stt():
    """Start live speech-to-text from the microphone into Watson STT.

    Swaps the mic button to its 'on' state, binds the space bar to emit
    CTRL+C (which ends recognition), then streams PyAudio input through a
    queue to recognize_using_websocket until interrupted.
    """
    import time  # local import: used only by the polling loop below

    mic_photo_on = PhotoImage(file=r"images/mic-on-50.png")
    mic_on = Button(window, text="Mic", image=mic_photo_on, background="white",
                    activebackground="white", border=0, command=end_stt)
    mic_on.place(x=900, y=148)

    def space_break():
        # Emulate CTRL+C so the recording loop below gets interrupted.
        keyboard = Controller()
        with keyboard.pressed(Key.control):
            keyboard.press('c')
            keyboard.release('c')

    window.bind('<space>', space_break)

    try:
        from Queue import Queue, Full  # Python 2
    except ImportError:
        from queue import Queue, Full  # Python 3

    ###############################################
    #### Initalize queue / the thing to store the audio recordings ##
    ###############################################
    CHUNK = 1500
    # *** if the websocket client isn't fast enough it will just discard
    # *** if that happens it said to try using a larger max size
    BUF_MAX_SIZE = CHUNK * 20

    # Buffer to store audio
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

    # Create an instance of AudioSource
    audio_source = AudioSource(q, True, True)

    ###############################################
    #### Prepare Speech to Text Service ########
    ###############################################
    # SECURITY NOTE(review): hard-coded API key — move to env/config.
    speech_to_text = SpeechToTextV1(
        iam_apikey="ZAM8vwm2g3Dsnh1UPjOqyI-PloGvZ-PjSEAbjT_JHk1s",
        url="https://gateway-wdc.watsonplatform.net/speech-to-text/api")

    # define callback for the speech to text service
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_transcription(self, transcript):
            print(transcript)

        def on_connected(self):
            print('Connection was successful')

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

        def on_listening(self):
            print('Service is listening')

        def on_hypothesis(self, hypothesis):
            print(hypothesis)

        def on_data(self, data):
            print(data)

        def on_close(self):
            print("Connection closed")

    # FIX: corrected internal name (was `recognize_using_weboscket`); this
    # function initiates the recognize service with the AudioSource.
    def recognize_using_websocket(*args):
        mycallback = MyRecognizeCallback()
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/l16; rate=44100',
            recognize_callback=mycallback,
            interim_results=True)

    ###############################################
    #### Prepare the for recording using Pyaudio ##
    ###############################################
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    # define callback for pyaudio to store the recording in queue
    def pyaudio_callback(in_data, frame_count, time_info, status):
        try:
            q.put(in_data)
        except Full:
            pass  # discard
        return (None, pyaudio.paContinue)

    # instantiate pyaudio and open the stream using the callback
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False)

    #########################################################################
    #### Start the recording and start service to recognize the stream ######
    #########################################################################
    print("Enter CTRL+C to end recording...")
    stream.start_stream()
    try:
        recognize_thread = Thread(target=recognize_using_websocket, args=())
        recognize_thread.start()
        # FIX: sleep between polls instead of a 100%-CPU busy wait.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        # stop recording
        # FIX: threading.Thread has no stop(); the original call raised
        # AttributeError here. Closing the audio source below lets the
        # recognize thread finish on its own.
        stream.stop_stream()
        stream.close()
        audio.terminate()
        audio_source.completed_recording()