def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    # Compute WAV duration
    with io.BytesIO(wav_bytes) as wav_buffer:
        with wave.open(wav_buffer) as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
            wav_duration = frames / float(rate)

    # Process data as an entire utterance
    start_time = time.perf_counter()
    text, confidence = self._transcribe_wav(wav_bytes)
    transcribe_seconds = time.perf_counter() - start_time
    _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

    if text is not None:
        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
        )

    return None
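# The functions in this section all build Transcription/TranscriptionToken
# objects from the rhasspyasr package. A minimal sketch of those dataclasses,
# inferred from how the fields are used here (not the package's authoritative
# definitions):
import typing
from dataclasses import dataclass

@dataclass
class TranscriptionToken:
    """A single recognized word with timing and confidence."""
    token: str
    likelihood: float
    start_time: float
    end_time: float

@dataclass
class Transcription:
    """Result of a speech-to-text operation."""
    text: str
    likelihood: float
    transcribe_seconds: float
    wav_seconds: float
    tokens: typing.Optional[typing.List[TranscriptionToken]] = None

    @classmethod
    def empty(cls) -> "Transcription":
        """Empty transcription, used when decoding fails."""
        return cls(text="", likelihood=0, transcribe_seconds=0, wav_seconds=0)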
def transcribe_wav(self, wav_data: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    self.load_decoder()
    assert self.decoder

    _LOGGER.debug("Decoding %s byte(s)", len(wav_data))
    start_time = time.perf_counter()

    with io.BytesIO(wav_data) as wav_buffer:
        with wave.open(wav_buffer, "rb") as wav_file:
            sample_rate = wav_file.getframerate()
            num_frames = wav_file.getnframes()
            wav_duration = num_frames / float(sample_rate)

            frames = wav_file.readframes(num_frames)
            samples = struct.unpack_from("<%dh" % num_frames, frames)

    # Decode
    success = self.decoder.decode(
        sample_rate, np.array(samples, dtype=np.float32), True
    )

    if success:
        text, likelihood = self.decoder.get_decoded_string()
        transcribe_seconds = time.perf_counter() - start_time

        return Transcription(
            text=text.strip(),
            likelihood=likelihood,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
        )

    # Failure
    return None
def transcribe_stream(
    self,
    audio_stream: typing.Iterable[bytes],
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> typing.Optional[Transcription]:
    """Speech to text from an audio stream."""
    total_frames = 0
    start_time = time.perf_counter()

    # Buffer the entire stream, then transcribe it as one utterance
    wav_frames = bytearray()
    for frame in audio_stream:
        wav_frames.extend(frame)
        total_frames += len(frame) // sample_width

    wav_bytes = bytes(wav_frames)
    text, confidence = self._transcribe_wav(wav_bytes)
    transcribe_seconds = time.perf_counter() - start_time
    _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

    if text is not None:
        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=total_frames / float(sample_rate),
        )

    return None
def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    start_time = time.perf_counter()

    with tempfile.NamedTemporaryFile(suffix=".wav", mode="wb") as wav_file:
        wav_file.write(wav_bytes)
        wav_file.seek(0)

        if self.model_type == KaldiModelType.NNET3:
            text = self._transcribe_wav_nnet3(wav_file.name)
        elif self.model_type == KaldiModelType.GMM:
            text = self._transcribe_wav_gmm(wav_file.name)
        else:
            raise ValueError(self.model_type)

    if text:
        # Success
        end_time = time.perf_counter()
        return Transcription(
            text=text.strip(),
            likelihood=1,
            transcribe_seconds=(end_time - start_time),
            wav_seconds=get_wav_duration(wav_bytes),
        )

    # Failure
    return None
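# Several functions here call get_wav_duration(), which is not shown in this
# section. A minimal sketch, mirroring the inline duration computation in the
# first snippet and assuming standard uncompressed PCM WAV input:
import io
import wave

def get_wav_duration(wav_bytes: bytes) -> float:
    """Return the duration of WAV audio in seconds."""
    with io.BytesIO(wav_bytes) as wav_buffer:
        with wave.open(wav_buffer, "rb") as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())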
async def async_test_silence(self):
    """Check start/stop session with silence detection."""
    fake_transcription = Transcription(
        text="turn on the living room lamp",
        likelihood=1,
        transcribe_seconds=0,
        wav_seconds=0,
    )

    def fake_transcribe(stream, *args):
        """Return test transcription."""
        for chunk in stream:
            if not chunk:
                break

        return fake_transcription

    self.transcriber.transcribe_stream = fake_transcribe

    # Start session
    start_listening = AsrStartListening(
        site_id=self.site_id,
        session_id=self.session_id,
        stop_on_silence=True,
        send_audio_captured=False,
    )

    result = None
    async for response in self.hermes.on_message_blocking(start_listening):
        result = response

    # No response expected
    self.assertIsNone(result)

    # Send in "audio"
    wav_path = Path("etc/turn_on_the_living_room_lamp.wav")

    results = []
    with open(wav_path, "rb") as wav_file:
        for wav_bytes in AudioFrame.iter_wav_chunked(wav_file, 4096):
            frame = AudioFrame(wav_bytes=wav_bytes)
            async for response in self.hermes.on_message_blocking(
                frame, site_id=self.site_id
            ):
                results.append(response)

    # Expect transcription
    self.assertEqual(
        results,
        [
            AsrRecordingFinished(site_id=self.site_id, session_id=self.session_id),
            AsrTextCaptured(
                text=fake_transcription.text,
                likelihood=fake_transcription.likelihood,
                seconds=fake_transcription.transcribe_seconds,
                site_id=self.site_id,
                session_id=self.session_id,
            ),
        ],
    )
def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    self.maybe_load_model()
    assert self.model, "Model was not loaded"

    start_time = time.perf_counter()

    # Convert to raw numpy buffer
    with io.BytesIO(wav_bytes) as wav_io:
        with wave.open(wav_io) as wav_file:
            audio_bytes = wav_file.readframes(wav_file.getnframes())
            audio_buffer = np.frombuffer(audio_bytes, np.int16)

    metadata = self.model.sttWithMetadata(audio_buffer)
    end_time = time.perf_counter()

    if metadata:
        # Actual transcription
        text = ""

        # Individual tokens
        tokens: typing.List[TranscriptionToken] = []

        word = ""
        word_start_time = 0
        for index, item in enumerate(metadata.items):
            text += item.character

            if item.character != " ":
                # Add to current word
                word += item.character

            if item.character == " " or (index == (len(metadata.items) - 1)):
                # Combine into single tokens
                tokens.append(
                    TranscriptionToken(
                        token=word,
                        likelihood=1,
                        start_time=word_start_time,
                        end_time=item.start_time,
                    )
                )

                # Word break
                word = ""
                word_start_time = 0
            elif len(word) == 1:
                # First character of a new word
                word_start_time = item.start_time

        return Transcription(
            text=text,
            likelihood=metadata.confidence,
            transcribe_seconds=(end_time - start_time),
            wav_seconds=get_wav_duration(wav_bytes),
            tokens=tokens,
        )

    # Failure
    return None
def transcribe_stream(
    self,
    audio_stream: typing.Iterable[bytes],
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> typing.Optional[Transcription]:
    """Speech to text from an audio stream."""
    assert channels == 1, "Only mono audio supported"

    self.load_decoder()
    assert self.decoder

    start_time = time.perf_counter()

    # Decode with one chunk of look-ahead so the final chunk can be finalized
    last_chunk: typing.Optional[bytes] = None
    audio_iter = iter(audio_stream)
    total_frames: int = 0

    while True:
        try:
            next_chunk = next(audio_iter)

            if last_chunk:
                # Don't finalize
                num_frames = len(last_chunk) // sample_width
                total_frames += num_frames
                samples = struct.unpack_from("<%dh" % num_frames, last_chunk)
                self.decoder.decode(
                    sample_rate, np.array(samples, dtype=np.float32), False
                )

            last_chunk = next_chunk
        except StopIteration:
            break

    if not last_chunk:
        # Add one empty frame for finalization
        last_chunk = bytes([0] * sample_width)

    # Finalize
    num_frames = len(last_chunk) // sample_width
    total_frames += num_frames
    samples = struct.unpack_from("<%dh" % num_frames, last_chunk)
    success = self.decoder.decode(
        sample_rate, np.array(samples, dtype=np.float32), True
    )

    if success:
        text, likelihood = self.decoder.get_decoded_string()
        transcribe_seconds = time.perf_counter() - start_time

        return Transcription(
            text=text.strip(),
            likelihood=likelihood,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=total_frames / float(sample_rate),
        )

    # Failure
    return None
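# A hypothetical caller for the streaming decoder above. "transcriber" and
# "example.wav" are placeholders; the chunking mirrors the stdin streaming
# helper in the CLI snippet later in this section:
import wave

def wav_frame_chunks(wav_file: wave.Wave_read, frames_per_chunk: int = 1024):
    """Yield raw PCM chunks until the WAV file is exhausted."""
    while True:
        chunk = wav_file.readframes(frames_per_chunk)
        if not chunk:
            break
        yield chunk

with wave.open("example.wav", "rb") as wav_file:
    result = transcriber.transcribe_stream(
        wav_frame_chunks(wav_file),
        wav_file.getframerate(),
        wav_file.getsampwidth(),
        wav_file.getnchannels(),
    )
    print(result.text if result else "<no transcription>")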
def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
    """Transcribe WAV data."""
    if not self.julius_proc:
        self.start_julius()

    assert self.julius_in and self.julius_out, "Julius not started"

    # Compute WAV duration
    wav_duration = get_wav_duration(wav_bytes)

    # Write path to WAV file
    _LOGGER.debug("Sending %s byte(s) to Julius", len(wav_bytes))
    start_time = time.time()

    with tempfile.NamedTemporaryFile(suffix=".wav", mode="wb+") as temp_file:
        temp_file.write(wav_bytes)
        temp_file.seek(0)

        print(temp_file.name, file=self.julius_out)
        self.julius_out.flush()

        sentence_line = ""
        line = self.julius_in.readline().strip()
        _LOGGER.debug("Julius> %s", line)

        while True:
            if line.startswith("sentence1:"):
                sentence_line = line.split(":", maxsplit=1)[1]
                break

            if "error" in line.lower():
                # Give up with an empty transcription
                _LOGGER.warning(line)
                break

            line = self.julius_in.readline().strip()
            _LOGGER.debug("Julius> %s", line)

    # Exclude <s> and </s>
    _LOGGER.debug(sentence_line)
    result_text = sentence_line.replace("<s>", "").replace("</s>", "").strip()
    end_time = time.time()

    return Transcription(
        text=result_text,
        transcribe_seconds=end_time - start_time,
        wav_seconds=wav_duration,
        likelihood=1,
    )
def metadata_to_transcription(
    metadata: typing.Optional[deepspeech.Metadata],
    wav_seconds: float,
    transcribe_seconds: float,
) -> typing.Optional[Transcription]:
    """Convert DeepSpeech metadata to a Rhasspy Transcription."""
    if metadata:
        # Actual transcription
        text = ""

        # Individual tokens
        tokens: typing.List[TranscriptionToken] = []

        confidence = 1.0
        if metadata.transcripts:
            transcript = next(iter(metadata.transcripts))
            confidence = math.exp(transcript.confidence)

            words_and_tokens: typing.List[typing.Any] = [["", []]]

            # Organize by whitespace-separated words
            for token in transcript.tokens:
                text += token.text
                if token.text.strip():
                    # Part of a word
                    words_and_tokens[-1][0] += token.text
                    words_and_tokens[-1][1].append(token)
                else:
                    # Whitespace
                    words_and_tokens.append(["", []])

            for word, word_tokens in words_and_tokens:
                if not (word and word_tokens):
                    continue

                # DeepSpeech tokens only carry start times, so the last
                # character's start time stands in for the word's end time.
                tokens.append(
                    TranscriptionToken(
                        token=word,
                        likelihood=1,
                        start_time=word_tokens[0].start_time,
                        end_time=word_tokens[-1].start_time,
                    )
                )

        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_seconds,
            tokens=tokens,
        )

    # Failure
    return None
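# A hypothetical caller for metadata_to_transcription(), assuming an already
# loaded DeepSpeech 0.7+ model ("model" is a placeholder) and 16 kHz 16-bit
# mono PCM in "audio_bytes":
import time
import numpy as np

audio_buffer = np.frombuffer(audio_bytes, dtype=np.int16)

start_time = time.perf_counter()
metadata = model.sttWithMetadata(audio_buffer, 1)  # keep 1 candidate transcript
transcribe_seconds = time.perf_counter() - start_time

transcription = metadata_to_transcription(
    metadata,
    wav_seconds=len(audio_buffer) / 16000.0,
    transcribe_seconds=transcribe_seconds,
)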
def transcribe_wav(self, wav_data: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    kaldi_cmd = [
        "kaldi-decode",
        "--model-type",
        str(self.model_type),
        "--model-dir",
        str(self.model_dir),
        "--graph-dir",
        str(self.graph_dir),
    ]

    _LOGGER.debug(kaldi_cmd)

    with tempfile.NamedTemporaryFile(suffix=".wav", mode="wb") as temp_file:
        temp_file.write(wav_data)

        # Rewind
        temp_file.seek(0)

        kaldi_proc = subprocess.Popen(
            kaldi_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )

        # Write path to WAV file
        print(temp_file.name, file=kaldi_proc.stdin)

        # Get result back as JSON
        result_json, _ = kaldi_proc.communicate()
        _LOGGER.debug(result_json)
        result = json.loads(result_json)

    # Empty string indicates failure
    text = str(result.get("text", ""))

    if text:
        # Success
        return Transcription(
            text=text.strip(),
            likelihood=float(result.get("likelihood", 0)),
            transcribe_seconds=float(result.get("transcribe_seconds", 0)),
            wav_seconds=float(result.get("wav_seconds", 0)),
        )

    # Failure
    return None
def transcribe_proc():
    """Transcribe live audio stream indefinitely."""
    while True:
        # Get result of transcription
        transcribe_result = transcriber.transcribe_stream(
            audio_stream(), sample_rate, sample_width, channels
        )

        _LOGGER.debug("Transcription result: %s", transcribe_result)

        transcribe_result = transcribe_result or Transcription.empty()
        transcribe_dict = dataclasses.asdict(transcribe_result)
        transcribe_dict["timeout"] = is_timeout

        print_json(transcribe_dict)
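# print_json() is used throughout this section but never defined here. A
# minimal sketch, assuming it emits one JSON object per line on stdout; the
# dataclass handling is an assumption to cover callers that pass Transcription
# objects directly:
import dataclasses
import json

def print_json(value) -> None:
    """Print a single line of JSON to stdout and flush."""
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        value = dataclasses.asdict(value)

    print(json.dumps(value, ensure_ascii=False), flush=True)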
def transcribe_stream(
    self,
    audio_stream: typing.Iterable[bytes],
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> typing.Optional[Transcription]:
    """Speech to text from an audio stream."""
    assert channels == 1, "Only mono audio supported"

    if self.decoder is None:
        # Load decoder
        self.decoder = self.get_decoder()

    total_frames = 0

    # Process data as an entire utterance
    start_time = time.perf_counter()
    self.decoder.start_utt()

    for frame in audio_stream:
        self.decoder.process_raw(frame, False, False)
        total_frames += len(frame) // sample_width

    self.decoder.end_utt()
    transcribe_seconds = time.perf_counter() - start_time
    _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

    hyp = self.decoder.hyp()
    if hyp:
        return Transcription(
            text=hyp.hypstr.strip(),
            likelihood=self.decoder.get_logmath().exp(hyp.prob),
            transcribe_seconds=transcribe_seconds,
            wav_seconds=total_frames / float(sample_rate),
            tokens=[
                TranscriptionToken(
                    token=seg.word,
                    start_time=seg.start_frame / 100,
                    end_time=seg.end_frame / 100,
                    likelihood=self.decoder.get_logmath().exp(seg.prob),
                )
                for seg in self.decoder.seg()
            ],
        )

    return None
def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
    """Speech to text from WAV data."""
    if self.decoder is None:
        # Load decoder
        self.decoder = self.get_decoder()

    # Compute WAV duration
    audio_data: bytes = bytes()
    with io.BytesIO(wav_bytes) as wav_buffer:
        with wave.open(wav_buffer) as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
            wav_duration = frames / float(rate)

            # Extract raw audio data
            audio_data = wav_file.readframes(wav_file.getnframes())

    # Process data as an entire utterance
    start_time = time.perf_counter()
    self.decoder.start_utt()
    self.decoder.process_raw(audio_data, False, True)
    self.decoder.end_utt()
    transcribe_seconds = time.perf_counter() - start_time
    _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

    hyp = self.decoder.hyp()
    if hyp:
        return Transcription(
            text=hyp.hypstr.strip(),
            likelihood=self.decoder.get_logmath().exp(hyp.prob),
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
            tokens=[
                TranscriptionToken(
                    token=seg.word,
                    start_time=seg.start_frame / 100,
                    end_time=seg.end_frame / 100,
                    likelihood=self.decoder.get_logmath().exp(seg.prob),
                )
                for seg in self.decoder.seg()
            ],
        )

    return None
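# The pocketsphinx snippets above call self.get_decoder(), which is not shown.
# A sketch of such a method using the pocketsphinx Python bindings; the
# acoustic_model, dictionary, and language_model attribute names are
# assumptions:
import os
import pocketsphinx

def get_decoder(self) -> pocketsphinx.Decoder:
    """Load a pocketsphinx decoder from model files."""
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", str(self.acoustic_model))
    config.set_string("-dict", str(self.dictionary))
    config.set_string("-lm", str(self.language_model))
    config.set_string("-logfn", os.devnull)  # silence pocketsphinx logging

    return pocketsphinx.Decoder(config)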
def transcribe(args: argparse.Namespace):
    """Do speech to text on one or more WAV files."""
    # Load transcriber
    args.model_dir = Path(args.model_dir)

    if args.graph_dir:
        args.graph_dir = Path(args.graph_dir)
    else:
        args.graph_dir = args.model_dir / "graph"

    transcriber = KaldiCommandLineTranscriber(
        args.model_type, args.model_dir, args.graph_dir
    )

    # Do transcription
    try:
        if args.wav_file:
            # Transcribe WAV files
            for wav_path in args.wav_file:
                _LOGGER.debug("Processing %s", wav_path)

                with open(wav_path, "rb") as wav_file:
                    wav_bytes = wav_file.read()

                result = transcriber.transcribe_wav(wav_bytes)
                if not result:
                    result = Transcription.empty()

                print_json(result)
        else:
            # Read WAV data from stdin
            if os.isatty(sys.stdin.fileno()):
                print("Reading WAV data from stdin...", file=sys.stderr)

            # Stream in chunks
            with wave.open(sys.stdin.buffer, "rb") as wav_file:

                def audio_stream(wav_file, frames_in_chunk):
                    num_frames = wav_file.getnframes()
                    try:
                        while num_frames > frames_in_chunk:
                            yield wav_file.readframes(frames_in_chunk)
                            num_frames -= frames_in_chunk

                        if num_frames > 0:
                            # Last chunk
                            yield wav_file.readframes(num_frames)
                    except KeyboardInterrupt:
                        pass

                result = transcriber.transcribe_stream(
                    audio_stream(wav_file, args.frames_in_chunk),
                    wav_file.getframerate(),
                    wav_file.getsampwidth(),
                    wav_file.getnchannels(),
                )

                assert result
                print_json(result)
    except KeyboardInterrupt:
        pass
    finally:
        transcriber.stop()
def transcribe_stream(
    self,
    audio_stream: typing.Iterable[bytes],
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> typing.Optional[Transcription]:
    """Speech to text from an audio stream."""
    if self.model_type == KaldiModelType.NNET3:
        # Use online2-tcp-nnet3-decode-faster
        if not self.decode_proc:
            self.start_decode()

        # Connect to decoder
        client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client_socket.settimeout(self.timeout_seconds)
        client_socket.connect(("localhost", self.port_num))
        client_file = client_socket.makefile(mode="rb")

        start_time = time.perf_counter()
        num_frames = 0
        for chunk in audio_stream:
            if chunk:
                client_socket.sendall(chunk)
                num_frames += len(chunk) // sample_width

        # Partial shutdown of socket (write only).
        # This should force the Kaldi server to finalize the output.
        client_socket.shutdown(socket.SHUT_WR)

        _LOGGER.debug("Finished stream. Getting transcription.")
        lines = client_file.read().decode().splitlines()

        text = ""
        _LOGGER.debug(lines)
        if lines:
            # Find longest line
            for line in reversed(lines):
                line = line.strip()
                if len(line) > len(text):
                    text = line
        else:
            # No result
            text = ""

        if text:
            # Success
            end_time = time.perf_counter()
            return Transcription(
                text=text,
                likelihood=1,
                transcribe_seconds=(end_time - start_time),
                wav_seconds=(num_frames / sample_rate),
            )

        # Failure
        return None

    if self.model_type == KaldiModelType.GMM:
        # No online streaming support.
        # Re-package as a WAV.
        with io.BytesIO() as wav_buffer:
            wav_file: wave.Wave_write = wave.open(wav_buffer, "wb")
            with wav_file:
                wav_file.setframerate(sample_rate)
                wav_file.setsampwidth(sample_width)
                wav_file.setnchannels(channels)

                for frame in audio_stream:
                    wav_file.writeframes(frame)

            return self.transcribe_wav(wav_buffer.getvalue())

    raise ValueError(f"Unsupported model type: {self.model_type}")
def transcribe_proc(info, transcriber_factory, sample_rate, sample_width, channels):
    """Transcribe audio streamed through a frame queue in a background thread."""

    def audio_stream(frame_queue) -> typing.Iterable[bytes]:
        # Pull frames from the queue
        frames = frame_queue.get()
        while frames:
            yield frames
            frames = frame_queue.get()

    try:
        info.transcriber = transcriber_factory(port_num=self.kaldi_port)
        assert info.transcriber is not None, "Failed to create transcriber"

        while True:
            # Wait for session to start
            info.ready_event.wait()
            info.ready_event.clear()

            # Get result of transcription
            result = info.transcriber.transcribe_stream(
                audio_stream(info.frame_queue),
                sample_rate,
                sample_width,
                channels,
            )

            _LOGGER.debug("Transcription result: %s", result)

            assert result is not None and result.text, "Null transcription"

            # Signal completion
            info.result = result
            info.result_event.set()

            if not self.reuse_transcribers:
                try:
                    info.transcriber.stop()
                except Exception:
                    _LOGGER.exception("Transcriber stop")

                break
    except Exception:
        _LOGGER.exception("session proc")

        # Mark as not reusable
        info.reuse = False

        # Stop transcriber
        if info.transcriber is not None:
            try:
                info.transcriber.stop()
            except Exception:
                _LOGGER.exception("Transcriber stop")

        # Signal failure
        info.transcriber = None
        info.result = Transcription(
            text="", likelihood=0, transcribe_seconds=0, wav_seconds=0
        )
        info.result_event.set()
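# A sketch of the per-session "info" object mutated above, with fields inferred
# from usage in this section; the class name and defaults are assumptions:
import queue
import threading
import typing
from dataclasses import dataclass, field

@dataclass
class TranscriberInfo:
    """State shared between a session and its transcription thread."""
    transcriber: typing.Any = None
    frame_queue: "queue.Queue[typing.Optional[bytes]]" = field(default_factory=queue.Queue)
    ready_event: threading.Event = field(default_factory=threading.Event)
    result_event: threading.Event = field(default_factory=threading.Event)
    result: typing.Optional[Transcription] = None
    reuse: bool = True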
async def transcribe_wav(args: argparse.Namespace, core: Voice2JsonCore) -> None:
    """Speech to text from WAV file(s)."""
    from rhasspyasr import Transcription

    # Make sure profile has been trained
    assert core.check_trained(), "Not trained"

    # Get speech to text transcriber for profile
    transcriber = core.get_transcriber(open_transcription=args.open, debug=args.debug)

    # Directory to report WAV file names relative to
    relative_dir = (
        None if args.relative_directory is None else Path(args.relative_directory)
    )

    try:
        if args.wav_file or args.stdin_files:
            # Read WAV file paths
            wav_files = args.wav_file
            if args.stdin_files:
                _LOGGER.debug("Reading file paths from stdin")
                wav_files = itertools.chain(wav_files, sys.stdin)

            for wav_path_str in wav_files:
                wav_path_str = wav_path_str.strip()

                # Load and convert
                wav_path = Path(wav_path_str)
                _LOGGER.debug("Transcribing %s", wav_path)

                wav_data = await core.maybe_convert_wav(wav_path.read_bytes())

                # Transcribe
                transcription = (
                    transcriber.transcribe_wav(wav_data) or Transcription.empty()
                )
                result = dataclasses.asdict(transcription)

                if relative_dir is None:
                    # Add name of WAV file to result
                    result["wav_name"] = wav_path.name
                else:
                    # Make relative to some directory
                    result["wav_name"] = str(
                        wav_path.absolute().relative_to(relative_dir.absolute())
                    )

                print_json(result)
        else:
            # Read WAV data from stdin
            _LOGGER.debug("Reading WAV data from stdin")

            if args.input_size:
                # Number of bytes is on a separate line
                line = sys.stdin.buffer.readline().strip()
                if not line:
                    return

                num_bytes = int(line)
                while num_bytes > 0:
                    # Read in WAV
                    wav_data = sys.stdin.buffer.read(num_bytes)
                    while len(wav_data) < num_bytes:
                        # Keep reading until exactly num_bytes have arrived
                        wav_data += sys.stdin.buffer.read(num_bytes - len(wav_data))

                    # Transcribe
                    wav_data = await core.maybe_convert_wav(wav_data)
                    transcription = (
                        transcriber.transcribe_wav(wav_data) or Transcription.empty()
                    )
                    result = dataclasses.asdict(transcription)

                    print_json(result)

                    # Next WAV
                    line = sys.stdin.buffer.readline().strip()
                    if not line:
                        break

                    num_bytes = int(line)
            else:
                # Load and convert entire input
                wav_data = await core.maybe_convert_wav(sys.stdin.buffer.read())

                # Transcribe
                transcription = (
                    transcriber.transcribe_wav(wav_data) or Transcription.empty()
                )
                result = dataclasses.asdict(transcription)

                print_json(result)
    finally:
        transcriber.stop()
async def async_test_session(self):
    """Check good start/stop session."""
    fake_transcription = Transcription(
        text="this is a test", likelihood=1, transcribe_seconds=0, wav_seconds=0
    )

    def fake_transcribe(stream, *args):
        """Return test transcription."""
        for chunk in stream:
            if not chunk:
                break

        return fake_transcription

    self.transcriber.transcribe_stream = fake_transcribe

    # Start session
    start_listening = AsrStartListening(
        site_id=self.site_id,
        session_id=self.session_id,
        stop_on_silence=False,
        send_audio_captured=True,
    )

    result = None
    async for response in self.hermes.on_message_blocking(start_listening):
        result = response

    # No response expected
    self.assertIsNone(result)

    # Send in "audio"
    fake_wav_bytes = self.hermes.to_wav_bytes(secrets.token_bytes(100))
    fake_frame = AudioFrame(wav_bytes=fake_wav_bytes)
    async for response in self.hermes.on_message_blocking(
        fake_frame, site_id=self.site_id
    ):
        result = response

    # No response expected
    self.assertIsNone(result)

    # Stop session
    stop_listening = AsrStopListening(
        site_id=self.site_id, session_id=self.session_id
    )

    results = []
    async for response in self.hermes.on_message_blocking(stop_listening):
        results.append(response)

    # Check results
    self.assertEqual(
        results,
        [
            AsrRecordingFinished(site_id=self.site_id, session_id=self.session_id),
            AsrTextCaptured(
                text=fake_transcription.text,
                likelihood=fake_transcription.likelihood,
                seconds=fake_transcription.transcribe_seconds,
                site_id=self.site_id,
                session_id=self.session_id,
            ),
            (
                AsrAudioCaptured(wav_bytes=fake_wav_bytes),
                {"site_id": self.site_id, "session_id": self.session_id},
            ),
        ],
    )
def transcribe_stream(
    self,
    audio_stream: typing.Iterable[bytes],
    sample_rate: int,
    sample_width: int,
    channels: int,
) -> typing.Optional[Transcription]:
    """Speech to text from an audio stream."""
    if self.model_type == KaldiModelType.NNET3:
        # Use online2-tcp-nnet3-decode-faster
        if not self.decode_proc:
            self.start_decode()

        assert self.decode_proc, "No decode process"

        start_time = time.perf_counter()
        num_frames = 0
        for chunk in audio_stream:
            if chunk:
                num_samples = len(chunk) // sample_width

                # Write sample count to process stdin
                print(num_samples, file=self.decode_proc.stdin)
                self.decode_proc.stdin.flush()

                # Write chunk to FIFO.
                # Make sure that we write exactly the right number of bytes.
                self.chunk_fifo_file.write(chunk[: num_samples * sample_width])
                self.chunk_fifo_file.flush()

                num_frames += num_samples

        # Finish utterance
        print("0", file=self.decode_proc.stdin)
        self.decode_proc.stdin.flush()

        _LOGGER.debug("Finished stream. Getting transcription.")

        confidence_and_text = ""
        for line in self.decode_proc.stdout:
            line = line.strip()
            if line.lower() == "ready":
                continue

            confidence_and_text = line
            break

        _LOGGER.debug(confidence_and_text)

        if confidence_and_text:
            # Success
            end_time = time.perf_counter()

            # <mbr_wer> <word> <word_confidence> <word_start_time> <word_end_time> ...
            wer_str, *words = confidence_and_text.split()
            confidence = 0.0

            try:
                # Try to parse minimum Bayes risk (MBR) word error rate (WER)
                confidence = max(0, 1 - float(wer_str))
            except ValueError:
                _LOGGER.exception(wer_str)

            tokens = []
            for word, word_confidence, word_start_time, word_end_time in grouper(
                words, n=4
            ):
                tokens.append(
                    TranscriptionToken(
                        token=word,
                        start_time=float(word_start_time),
                        end_time=float(word_end_time),
                        likelihood=float(word_confidence),
                    )
                )

            text = " ".join(t.token for t in tokens)

            return Transcription(
                text=text,
                likelihood=confidence,
                transcribe_seconds=(end_time - start_time),
                wav_seconds=(num_frames / sample_rate),
                tokens=tokens,
            )

        # Failure
        return None

    if self.model_type == KaldiModelType.GMM:
        # No online streaming support.
        # Re-package as a WAV.
        with io.BytesIO() as wav_buffer:
            wav_file: wave.Wave_write = wave.open(wav_buffer, "wb")
            with wav_file:
                wav_file.setframerate(sample_rate)
                wav_file.setsampwidth(sample_width)
                wav_file.setnchannels(channels)

                for frame in audio_stream:
                    wav_file.writeframes(frame)

            return self.transcribe_wav(wav_buffer.getvalue())

    raise ValueError(f"Unsupported model type: {self.model_type}")
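# The nnet3 branch above depends on a grouper() helper that is not defined in
# this section. A sketch using the standard itertools recipe, matching the
# grouper(words, n=4) call:
import itertools

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks: grouper('ABCDEFG', 3) -> ABC DEF Gxx."""
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)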