def audio_params_gen(audio_chunks, start):
    """Yield ``(config, RecognitionAudio)`` pairs, pacing all but the last chunk.

    Sleeps ``chunk_size`` seconds after each non-final chunk to simulate
    real-time streaming, then records the wall-clock time the final chunk was
    emitted into ``start[0]`` (a one-element list used as a mutable cell so the
    caller can measure latency after the stream is consumed).

    NOTE(review): relies on module/enclosing-scope names ``config`` (a callable
    taking a byte length) and ``chunk_size`` (seconds) — confirm they are in
    scope where this generator is defined.

    Fix over the original: an empty ``audio_chunks`` used to raise IndexError
    on ``audio_chunks[-1]``; now the generator simply yields nothing (and
    leaves ``start[0]`` untouched).
    """
    if not audio_chunks:
        return
    for chunk in audio_chunks[:-1]:
        yield config(len(chunk)), RecognitionAudio(content=chunk)
        time.sleep(chunk_size)
    # Timestamp taken just before the final chunk is handed to the consumer.
    start[0] = time.time()
    yield config(len(audio_chunks[-1])), RecognitionAudio(
        content=audio_chunks[-1])
def transcribe_audio(audio_stream, model: str, language_code: str):
    """Transcribe the given audio chunks"""
    # Single-shot (non-streaming) recognition against the module-level client.
    global client
    encoding = RecognitionConfig.AudioEncoding.LINEAR16
    try:
        recognition_audio = RecognitionAudio(content=audio_stream)
        recognition_config = RecognitionConfig(
            sample_rate_hertz=SR,
            encoding=encoding,
            language_code=language_code,
            max_alternatives=10,
            model=model,
        )
        response = client.recognize(
            recognition_config, recognition_audio, uuid="", timeout=1000)
    except Exception as exc:
        # Best-effort: report the failure and hand back an empty result set.
        print(f"error: {str(exc)}")
        return []
    return parse_response(response)
def transcribe_audio(audio_stream, model: str, language_code: str,
                     sample_rate=8000, max_alternatives=10, raw: bool = False):
    """Transcribe the given audio chunks"""
    # Parametrized single-shot recognition; a random uuid tags the request.
    global client
    try:
        request_audio = RecognitionAudio(content=audio_stream)
        request_config = RecognitionConfig(
            sample_rate_hertz=sample_rate,
            encoding=ENCODING,
            language_code=language_code,
            max_alternatives=max_alternatives,
            model=model,
            raw=raw,
            data_bytes=len(audio_stream),
        )
        request_uuid = str(random.randint(1000, 100000))
        response = client.recognize(
            request_config, request_audio, uuid=request_uuid, timeout=1000)
    except Exception as exc:
        # Best-effort: report the failure and hand back an empty result set.
        print(f"error: {str(exc)}")
        return []
    return parse_response(response)
def transcribe_chunks(audio_chunks, model: str, language_code: str,
                      raw: bool = False):
    """Transcribe the given audio chunks

    Streams ``audio_chunks`` to the module-level ``client``. With ``raw=True``
    each chunk gets its own per-length config and goes through
    ``streaming_recognize_raw``; otherwise a single config plus a lazy audio
    generator goes through ``streaming_recognize``.

    Returns the parsed response, or ``None`` on any error.
    NOTE(review): the sibling ``transcribe_audio`` helpers return ``[]`` on
    error — kept as ``None`` here so existing callers are not broken.
    """
    global client
    response = {}
    encoding = RecognitionConfig.AudioEncoding.LINEAR16
    try:
        if raw:
            # PEP 8 (E731): use a def, not a lambda bound to a name.
            def config(chunk_len):
                return RecognitionConfig(
                    sample_rate_hertz=SR,
                    encoding=encoding,
                    language_code=language_code,
                    max_alternatives=10,
                    model=model,
                    raw=True,
                    data_bytes=chunk_len)

            audio_params = [(config(len(chunk)), RecognitionAudio(content=chunk))
                            for chunk in audio_chunks]
            response = client.streaming_recognize_raw(
                audio_params, uuid="", timeout=1000)
        else:
            audio = (RecognitionAudio(content=chunk) for chunk in audio_chunks)
            config = RecognitionConfig(
                sample_rate_hertz=SR,
                encoding=encoding,
                language_code=language_code,
                max_alternatives=10,
                model=model,
            )
            response = client.streaming_recognize(
                config, audio, uuid="", timeout=1000)
    except Exception as e:
        traceback.print_exc()
        print(f'error: {str(e)}')
        return None
    return parse_response(response)
def transcribe_chunks_streaming(client, audio_chunks, model: str,
                                language_code: str, sample_rate=8000,
                                max_alternatives=10, raw: bool = False,
                                word_level: bool = False,
                                chunk_size: float = 0.5):
    """Transcribe the given audio chunks"""
    # Streams chunks to the given client and pretty-prints the parsed result.
    # In raw mode, chunks are paced chunk_size seconds apart and the latency
    # from the last chunk sent to the response is printed in milliseconds.
    response = {}
    try:
        if raw:
            def make_config(num_bytes):
                return RecognitionConfig(
                    sample_rate_hertz=sample_rate,
                    encoding=ENCODING,
                    language_code=language_code,
                    max_alternatives=max_alternatives,
                    model=model,
                    raw=True,
                    word_level=word_level,
                    data_bytes=num_bytes)

            # One-element list acts as a mutable cell: the generator writes
            # the send-time of the final chunk into it as a side effect.
            start = [None]

            def audio_params_gen(chunks, sent_at):
                for piece in chunks[:-1]:
                    yield make_config(len(piece)), RecognitionAudio(content=piece)
                    time.sleep(chunk_size)
                sent_at[0] = time.time()
                yield make_config(len(chunks[-1])), RecognitionAudio(
                    content=chunks[-1])

            response = client.streaming_recognize_raw(
                audio_params_gen(audio_chunks, start),
                uuid=str(random.randint(1000, 100000)))
            end = time.time()
            print(f"{((end - start[0])*1000):.2f}ms")
        else:
            lazy_audio = (RecognitionAudio(content=piece)
                          for piece in audio_chunks)
            stream_config = RecognitionConfig(
                sample_rate_hertz=sample_rate,
                encoding=ENCODING,
                language_code=language_code,
                max_alternatives=max_alternatives,
                model=model,
                word_level=word_level)
            response = client.streaming_recognize(
                stream_config, lazy_audio,
                uuid=str(random.randint(1000, 100000)))
    except Exception as exc:
        traceback.print_exc()
        print(f'error: {str(exc)}')
    pprint(parse_response(response))
def transcribe_chunks_streaming(client, audio_chunks, model: str,
                                language_code: str, sample_rate=8000,
                                max_alternatives=10, raw: bool = False,
                                word_level: bool = False):
    """Transcribe the given audio chunks"""
    # Streams chunks to the given client and pretty-prints the parsed result.
    response = {}
    try:
        if raw:
            def build_config(num_bytes):
                return RecognitionConfig(
                    sample_rate_hertz=sample_rate,
                    encoding=ENCODING,
                    language_code=language_code,
                    max_alternatives=max_alternatives,
                    model=model,
                    raw=True,
                    word_level=word_level,
                    data_bytes=num_bytes)

            pairs = [(build_config(len(piece)), RecognitionAudio(content=piece))
                     for piece in audio_chunks]
            response = client.streaming_recognize_raw(pairs, uuid="")
        else:
            lazy_audio = (RecognitionAudio(content=piece)
                          for piece in audio_chunks)
            stream_config = RecognitionConfig(
                sample_rate_hertz=sample_rate,
                encoding=ENCODING,
                language_code=language_code,
                max_alternatives=max_alternatives,
                model=model,
                word_level=word_level)
            response = client.streaming_recognize(
                stream_config, lazy_audio, uuid="")
    except Exception as exc:
        traceback.print_exc()
        print(f'error: {str(exc)}')
    pprint(parse_response(response))
def ktranscribe():
    """Decode FILE to 8 kHz mono 16-bit WAV via ffmpeg and stream it for
    recognition through the module-level KS_CLIENT using CONFIG."""
    global KS_CLIENT
    # ffmpeg .run(capture_stdout=True) returns (stdout, stderr); index 0 is
    # the transcoded WAV bytes (first 30 seconds only, t="30").
    wav_bytes = (
        ffmpeg.input(FILE)
        .output("-", format="wav", acodec="pcm_s16le", ac=1, ar="8k", t="30")
        .overwrite_output()
        .run(capture_stdout=True, quiet=True)
    )[0]
    audio = (RecognitionAudio(content=piece) for piece in [wav_bytes])
    return KS_CLIENT.streaming_recognize(CONFIG, audio, uuid="")
def decode_audio(self, index: int):
    # NOTE: These are only assumptions for now so test failures might not
    # necessarily mean error in model/server.
    # Streams the chunks stored at self.audios[index][0] and records the
    # recognition result in self.results[index].
    decode_config = RecognitionConfig(
        sample_rate_hertz=8000,
        encoding=RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="hi",
        max_alternatives=10,
        model="general")
    chunk_stream = (RecognitionAudio(content=piece)
                    for piece in self.audios[index][0])
    audio = dreamer(chunk_stream, 1)
    self.results[index] = self.client.streaming_recognize(
        decode_config, audio, uuid="")
def audio_chunks_gen(audio_chunks):
    """Lazily wrap each raw byte chunk in a RecognitionAudio message."""
    yield from (RecognitionAudio(content=piece) for piece in audio_chunks)
def audio_params_gen(audio_chunks):
    """Lazily pair each chunk with a length-specific config.

    NOTE(review): relies on an enclosing/global callable ``config`` that takes
    a byte length — confirm it is in scope where this generator is used.
    """
    yield from ((config(len(piece)), RecognitionAudio(content=piece))
                for piece in audio_chunks)