# NOTE(review): the loop below reads `webrtc_ctx`, which is not defined anywhere
# in the visible text -- presumably this is the tail of a function whose header
# lies outside this view. Confirm the intended enclosing scope and indentation.

# Placeholder element that is overwritten with each newly received frame.
image_loc = st.empty()
while True:
    try:
        # Wait up to one second for the next frame from the receiver's queue.
        frame = webrtc_ctx.video_receiver.frames_queue.get(timeout=1)
    except queue.Empty:
        # No frame arrived within the timeout: stop the receiver and leave the loop.
        print("Queue is empty. Stop the loop.")
        webrtc_ctx.video_receiver.stop()
        break
    # The frame is decoded as BGR and converted to RGB before being wrapped
    # in a PIL image and displayed.
    img = frame.to_ndarray(format="bgr24")
    img = PIL.Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    image_loc.image(img)

# Client settings: a single Google STUN server for ICE, with both video and
# audio requested from the browser.
WEBRTC_CLIENT_SETTINGS = ClientSettings(
    rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
    media_stream_constraints={"video": True, "audio": True},
)

if __name__ == "__main__":
    # force=True is passed so this configuration is applied even if logging
    # handlers were already installed (see logging.basicConfig docs).
    logging.basicConfig(
        format="[%(asctime)s] %(levelname)7s from %(name)s in %(filename)s:%(lineno)d: "
        "%(message)s",
        force=True,
    )
    # Enable DEBUG output for both this module's logger and streamlit_webrtc's.
    logger.setLevel(level=logging.DEBUG)
    st_webrtc_logger = logging.getLogger("streamlit_webrtc")
    st_webrtc_logger.setLevel(logging.DEBUG)
    main()
from bokeh.models import CustomJS
from bokeh.models.widgets import Button
from bokeh.models.widgets import (
    Div,
)
from gtts import gTTS
from streamlit_bokeh_events import streamlit_bokeh_events
from streamlit_webrtc import (ClientSettings, VideoTransformerBase, WebRtcMode,
                              webrtc_streamer)
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Configure the Streamlit page to use the "wide" layout.
st.set_page_config(layout="wide")

# Directory that contains this source file.
HERE = Path(__file__).parent

# Client settings requesting a video track only (audio disabled).
WEBRTC_CLIENT_SETTINGS = ClientSettings(media_stream_constraints={
    "video": True,
    "audio": False
}, )


@st.cache(allow_output_mutation=True)
def update_slider():
    """Return a dict with a single ``"slide"`` entry initialised to 0.

    The result is cached via ``st.cache(allow_output_mutation=True)``;
    presumably callers mutate the returned dict to keep the slide index
    across reruns -- confirm against the call sites.
    """
    return {"slide": 0}


@st.cache
def load_model_from_drive():
    # NOTE(review): this definition is cut off in the visible text; only the
    # setup below is shown, so its return value and download logic are unknown.
    # Ensure the local "models" directory exists.
    save_dest = Path('models')
    save_dest.mkdir(exist_ok=True)
    # Expected location of the model checkpoint file.
    f_checkpoint = Path("models/asl_alphabet_9575.h5")
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    """Run live speech-to-text on audio from a send/receive WebRTC stream.

    Starts a ``webrtc_streamer`` in ``SENDRECV`` mode that requests both video
    and audio. Incoming audio frames are queued by the nested
    ``AudioProcessor`` (which hands back zero-filled frames in their place),
    drained by the polling loop below, fed to a DeepSpeech stream, and the
    intermediate transcript is rendered on the page.

    Args:
        model_path: Path passed to ``deepspeech.Model``.
        lm_path: Path passed to ``Model.enableExternalScorer``.
        lm_alpha: First argument to ``Model.setScorerAlphaBeta``.
        lm_beta: Second argument to ``Model.setScorerAlphaBeta``.
        beam: Value passed to ``Model.setBeamWidth``.

    Returns:
        None. Returns immediately when the stream is not playing; otherwise
        loops until ``webrtc_ctx.audio_processor`` is no longer set.
    """

    class AudioProcessor(AudioProcessorBase):
        """Queue every incoming audio frame and emit silent replacements."""

        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            # The lock guards `frames`, which is appended to here and drained
            # by the polling loop in the enclosing function.
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(
                self, frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
            """Store ``frames`` for transcription and return zeroed copies."""
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        client_settings=ClientSettings(
            rtc_configuration={
                "iceServers": [{
                    "urls": ["stun:stun.l.google.com:19302"]
                }]
            },
            media_stream_constraints={
                "video": True,
                "audio": True
            },
        ),
    )

    status_indicator = st.empty()

    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if not webrtc_ctx.audio_processor:
            # Message text is kept exactly as before so the displayed output
            # does not change.
            status_indicator.write("AudioReciver is not set. Abort.")
            break

        if stream is None:
            # The import and model construction happen once, on the first
            # iteration where an audio processor is available.
            from deepspeech import Model

            model = Model(model_path)
            model.enableExternalScorer(lm_path)
            model.setScorerAlphaBeta(lm_alpha, lm_beta)
            model.setBeamWidth(beam)

            stream = model.createStream()

            status_indicator.write("Model loaded.")

        # Take everything queued so far in one step while holding the lock, so
        # the processor cannot append between the snapshot and the clear.
        with webrtc_ctx.audio_processor.frames_lock:
            audio_frames = list(webrtc_ctx.audio_processor.frames)
            webrtc_ctx.audio_processor.frames.clear()

        if not audio_frames:
            time.sleep(0.1)
            status_indicator.write("No frame arrived.")
            continue

        status_indicator.write("Running. Say something!")

        # Concatenate the drained frames into one pydub segment.
        sound_chunk = pydub.AudioSegment.empty()
        for audio_frame in audio_frames:
            sound = pydub.AudioSegment(
                data=audio_frame.to_ndarray().tobytes(),
                sample_width=audio_frame.format.bytes,
                frame_rate=audio_frame.sample_rate,
                channels=len(audio_frame.layout.channels),
            )
            sound_chunk += sound

        if len(sound_chunk) > 0:
            # Convert to a single channel at the model's sample rate before
            # handing the samples to the DeepSpeech stream.
            sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                model.sampleRate())
            buffer = np.array(sound_chunk.get_array_of_samples())
            stream.feedAudioContent(buffer)
            text = stream.intermediateDecode()
            text_output.markdown(f"**Text:** {text}")
def webcam_input(style_model_name):
    """Show a webcam stream with style transfer applied to every frame.

    A sidebar slider selects the processing width (150 to 500 in steps of 50).
    A video-only ``webrtc_streamer`` is started with a transformer that runs
    each frame through ``style_transfer`` at that width. On every call the
    current slider value and ``style_model_name`` are pushed to the live
    transformer, if one exists.

    Args:
        style_model_name: Key into ``style_models_dict`` selecting the model.
    """
    WIDTH = st.sidebar.select_slider('QUALITY (May reduce the speed)',
                                     list(range(150, 501, 50)))

    class NeuralStyleTransferTransformer(VideoTransformerBase):
        """Video transformer that stylises frames with the selected model."""

        _width = WIDTH
        _model_name = style_model_name
        _model = None

        def __init__(self) -> None:
            # Guards `_model` against being swapped while a frame is using it.
            self._model_lock = threading.Lock()

            self._width = WIDTH
            self._update_model()

        def set_width(self, width):
            """Store ``width``; reload the model if the value changed."""
            update_needed = self._width != width
            self._width = width
            if update_needed:
                self._update_model()

        def update_model_name(self, model_name):
            """Store ``model_name``; reload the model if the value changed."""
            update_needed = self._model_name != model_name
            self._model_name = model_name
            if update_needed:
                self._update_model()

        def _update_model(self):
            """Load the model for the current name while holding the lock."""
            style_model_path = style_models_dict[self._model_name]
            with self._model_lock:
                self._model = get_model_from_path(style_model_path)

        def transform(self, frame):
            """Return the stylised frame, or the raw frame if no model is set."""
            image = frame.to_ndarray(format="bgr24")

            # Identity check: `==` may be overloaded by model or array types.
            if self._model is None:
                return image

            orig_h, orig_w = image.shape[0:2]

            # Downscale to the configured width, scaling height by the same
            # ratio.
            # cv2.resize used in a forked thread may cause memory leaks
            resized = np.asarray(
                Image.fromarray(image).resize(
                    (self._width, int(self._width * orig_h / orig_w))))

            with self._model_lock:
                transferred = style_transfer(resized, self._model)

            # NOTE(review): the * 255 scaling assumes style_transfer returns
            # values in [0, 1] -- confirm against its implementation.
            result = Image.fromarray((transferred * 255).astype(np.uint8))
            # Scale the stylised image back up to the original frame size.
            return np.asarray(result.resize((orig_w, orig_h)))

    ctx = webrtc_streamer(
        client_settings=ClientSettings(
            rtc_configuration={
                "iceServers": [{
                    "urls": ["stun:stun.l.google.com:19302"]
                }]
            },
            media_stream_constraints={
                "video": True,
                "audio": False
            },
        ),
        video_transformer_factory=NeuralStyleTransferTransformer,
        key="neural-style-transfer",
    )

    # Push the current settings to the transformer, if one exists.
    if ctx.video_transformer:
        ctx.video_transformer.set_width(WIDTH)
        ctx.video_transformer.update_model_name(style_model_name)
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    """Transcribe microphone audio from a send-only WebRTC stream.

    Opens an audio-only ``webrtc_streamer`` in ``SENDONLY`` mode, pulls frames
    from its audio receiver, feeds them to a DeepSpeech stream and shows the
    intermediate transcript on the page.

    Args:
        model_path: Path passed to ``deepspeech.Model``.
        lm_path: Path passed to ``Model.enableExternalScorer``.
        lm_alpha: First argument to ``Model.setScorerAlphaBeta``.
        lm_beta: Second argument to ``Model.setScorerAlphaBeta``.
        beam: Value passed to ``Model.setBeamWidth``.

    Returns:
        None. Returns immediately when the stream is not playing; otherwise
        loops until ``webrtc_ctx.audio_receiver`` is no longer set.
    """
    settings = ClientSettings(
        rtc_configuration={
            "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}],
        },
        media_stream_constraints={"video": False, "audio": True},
    )
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        client_settings=settings,
    )

    status_indicator = st.empty()
    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    recognizer = None
    decoder = None

    while True:
        # Without a receiver there is nothing to read from; report and stop.
        if not webrtc_ctx.audio_receiver:
            status_indicator.write("AudioReciver is not set. Abort.")
            break

        if decoder is None:
            # First pass with a receiver available: build the model and open
            # the decoding stream that all later chunks are fed into.
            from deepspeech import Model

            recognizer = Model(model_path)
            recognizer.enableExternalScorer(lm_path)
            recognizer.setScorerAlphaBeta(lm_alpha, lm_beta)
            recognizer.setBeamWidth(beam)
            decoder = recognizer.createStream()
            status_indicator.write("Model loaded.")

        try:
            received = webrtc_ctx.audio_receiver.get_frames(timeout=1)
        except queue.Empty:
            time.sleep(0.1)
            status_indicator.write("No frame arrived.")
            continue

        status_indicator.write("Running. Say something!")

        # Join all received frames into one segment, starting from an empty one.
        combined = sum(
            (pydub.AudioSegment(
                data=item.to_ndarray().tobytes(),
                sample_width=item.format.bytes,
                frame_rate=item.sample_rate,
                channels=len(item.layout.channels),
            ) for item in received),
            pydub.AudioSegment.empty(),
        )

        if len(combined) == 0:
            continue

        # Single channel at the model's sample rate, then decode.
        mono = combined.set_channels(1)
        resampled = mono.set_frame_rate(recognizer.sampleRate())
        samples = np.array(resampled.get_array_of_samples())
        decoder.feedAudioContent(samples)
        transcript = decoder.intermediateDecode()
        text_output.markdown(f"**Text:** {transcript}")