def transcribe_many_parallel(args, filepaths):
    """Transcribe multiple audio files, one subprocess per file.

    A fresh DeepSpeech ``Model`` is configured for every file and handed to a
    ``transcribe_file`` worker process. ``p.join()`` immediately after
    ``p.start()`` means files are processed sequentially; the subprocess only
    isolates each transcription, it does not parallelize them.

    NOTE(review): the loop is ``for index, filepath in filepaths``, which
    assumes ``filepaths`` is a sequence of (index, path) pairs; if callers
    pass a plain list of paths this raises ValueError — confirm whether
    ``enumerate(filepaths)`` was intended.
    """
    for index, filepath in filepaths:
        ds = Model(args.model)
        # All decoder settings below are optional CLI flags.
        if args.beam_width:
            ds.setBeamWidth(args.beam_width)
        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(args.scorer)
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
            # Scorer weights only make sense once a scorer is enabled.
            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            # Expected format: "word1:boost1,word2:boost2"
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))
        p = Process(target=transcribe_file, args=(args, ds, filepath, index))
        p.start()
        p.join()
        print('{}: Transcribed file {} of {} from "{}"'.format(
            time.strftime("%H:%M:%S", time.localtime()),
            index + 1, len(filepaths), filepath))
def ModelInitiate(model_file_path, lm_file_path, lm_alpha, lm_beta, beam_width):
    """Build a DeepSpeech model with an external scorer attached.

    Every setting is applied unconditionally, so all arguments are required:
    acoustic model path, scorer path, scorer alpha/beta weights and the CTC
    beam width.
    """
    ds = Model(model_file_path)
    ds.enableExternalScorer(lm_file_path)
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    ds.setBeamWidth(beam_width)
    return ds
def create_deepspeech_model(args):
    """Construct a DeepSpeech ``Model`` from parsed CLI arguments.

    Beam width, external scorer, scorer alpha/beta weights and hot-word
    boosts are all optional and applied only when present on ``args``.
    """
    model = Model(args.model)
    if args.beam_width:
        model.setBeamWidth(args.beam_width)
    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        started = timer()
        model.enableExternalScorer(args.scorer)
        elapsed = timer() - started
        print('Loaded scorer in {:.3}s.'.format(elapsed), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            model.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)
    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        # Hot-words arrive as "word:boost" pairs, comma separated.
        for entry in args.hot_words.split(','):
            word, boost = entry.split(':')
            model.addHotWord(word, float(boost))
    return model
def run():
    """Parse CLI arguments, build the DeepSpeech model and serve HTTP requests.

    Serves until interrupted (Ctrl-C), then closes the server cleanly.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='DeepSpeech Server')
    parser.add_argument('--port', default=3337, type=int, help='Port to listen on')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha', type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta', type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument('--google_key', help="Google Speech-Recognition API key.")
    args = parser.parse_args()

    ds = Model(args.model)
    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    # BUG FIX: --scorer is optional (required=False) but was passed to
    # enableExternalScorer unconditionally, crashing when omitted. Enable it
    # (and the alpha/beta weights, which are meaningless without a scorer)
    # only when supplied.
    if args.scorer:
        ds.enableExternalScorer(args.scorer)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    handler_class = ReqHandlerFactory(ds, args.google_key)
    server_address = ('', args.port)
    httpd = HTTPServer(server_address, handler_class)
    logging.info('Starting httpd...\n')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()
    logging.info('Stopping httpd...\n')
def get_model(lang):
    """Return a DeepSpeech model configured for *lang* from DS_PARAM."""
    params = DS_PARAM[lang]
    model = Model(params['model'])
    if params.get('beam_width'):
        model.setBeamWidth(params['beam_width'])
    if params.get('scorer'):
        print('Loading scorer from files {}'.format(params['scorer']),
              file=sys.stderr)
        t0 = timer()
        model.enableExternalScorer(params['scorer'])
        print('Loaded scorer in {:.3}s.'.format(timer() - t0), file=sys.stderr)
        if params.get('lm_alpha') and params.get('lm_beta'):
            model.setScorerAlphaBeta(params['lm_alpha'], params['lm_beta'])
    return model
def get_model(lang):
    """Build and return the DeepSpeech model configured for *lang*."""
    cfg = DS_PARAM[lang]
    ds_model = Model(cfg["model"])
    beam = cfg.get("beam_width")
    if beam:
        ds_model.setBeamWidth(beam)
    if cfg.get("scorer"):
        print(
            "Loading scorer from files {}".format(cfg["scorer"]),
            file=sys.stderr,
        )
        start = timer()
        ds_model.enableExternalScorer(cfg["scorer"])
        print("Loaded scorer in {:.3}s.".format(timer() - start), file=sys.stderr)
        if cfg.get("lm_alpha") and cfg.get("lm_beta"):
            ds_model.setScorerAlphaBeta(cfg["lm_alpha"], cfg["lm_beta"])
    return ds_model
def stt(model_path, audio, beam_width=None, scorer_path=None, lm_alpha=None,
        lm_beta=None, hot_words=None):
    """Run DeepSpeech speech-to-text over a WAV file.

    Parameters
    ----------
    model_path : path to the .pbmm acoustic model
    audio : path to a WAV file recorded at the model's sample rate
    beam_width, scorer_path, lm_alpha, lm_beta : optional decoder settings
    hot_words : accepted but currently unused (see TODO)

    Returns the post-processed metadata of the single best transcript.
    Exits the process when the WAV sample rate does not match the model's.
    """
    ds = Model(model_path)
    if beam_width:
        ds.setBeamWidth(beam_width)
    desired_sample_rate = ds.sampleRate()
    if scorer_path:
        ds.enableExternalScorer(scorer_path)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    # TODO: wire up hot-words once boosts are decided, e.g.
    #   for w in hot_words: ds.addHotWord(w, 6.2)

    # FIX: context manager guarantees the file is closed even on the error
    # path (the original leaked the handle when the sample rate mismatched).
    with wave.open(audio, 'rb') as fin:
        fs_orig = fin.getframerate()
        if fs_orig != desired_sample_rate:
            print(
                f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.',
                file=sys.stderr)
            # FIX: sys.exit instead of the built-in exit(), which is only
            # guaranteed to exist in interactive/site-enabled sessions.
            sys.exit(1)
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
def predict_speech_to_text(stream_file):
    """Transcribe *stream_file* (WAV) with a scorer-backed DeepSpeech model."""
    # Decoder hyper-parameters: language-model weight, word-insertion weight,
    # and CTC beam width (larger beam = better results, slower decoding).
    lm_weight = 0.85
    word_insertion_weight = 1.75
    beam = 500

    speech_model = Model(MODEL_PATH)
    # Attach the language scorer to improve accuracy, then tune the decoder.
    speech_model.enableExternalScorer(SCORER_PATH)
    speech_model.setBeamWidth(beam)
    speech_model.setScorerAlphaBeta(lm_weight, word_insertion_weight)

    # scipy turns the wav file into the int16 numpy array DeepSpeech expects.
    _, samples = wav.read(stream_file)
    return speech_model.stt(samples)
def load_deepspeech_model(self):
    """Load the DeepSpeech 0.9.3 acoustic model plus its external scorer.

    Model files are resolved relative to ``self.deepspeech_models_folder``;
    load durations are logged at debug level. Returns the ready model.
    """
    folder = self.deepspeech_models_folder
    acoustic_path = os.path.join(folder, "deepspeech-0.9.3-models.pbmm")
    scorer_path = os.path.join(folder, "deepspeech-0.9.3-models.scorer")

    # Decoder hyper-parameters.
    alpha = 0.93
    beta = 1.18
    beam = 100

    start = timer()
    ds = Model(acoustic_path)
    logger.debug("Loaded model in %0.3fs." % (timer() - start))

    start = timer()
    ds.enableExternalScorer(scorer_path)
    ds.setScorerAlphaBeta(alpha, beta)
    ds.setBeamWidth(beam)
    logger.debug("Loaded external scorer in %0.3fs." % (timer() - start))
    return ds
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="",
         hot_words=""):
    """Load a DeepSpeech model plus optional scorer and hot-words.

    Parameters
    ----------
    model : path to the .pbmm acoustic model
    scorer : path to the external scorer file ("" / falsy to skip)
    verbose : print progress to stderr
    beam_width : CTC beam width (falsy to keep the model default)
    lm_alpha, lm_beta : scorer weights (applied only when both are set)
    hot_words : "word1:boost1,word2:boost2" spec (falsy to skip)

    Returns (model, desired_sample_rate).
    """
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # IDIOM FIX: plain truthiness instead of "verbose == True" throughout.
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds, desired_sample_rate
def MozillaSTT(audio_path):
    """Transcribe *audio_path* using the module-level DeepSpeech model paths.

    Returns the plain transcript, a newline, then the 3-best JSON metadata.
    """
    # TODO: handle different rates (not implemented)
    wav_in = wave.open(audio_path, 'rb')

    ds = Model(model_file_path)
    ds.enableExternalScorer(scorer_file_path)
    # Scorer weights: language-model weight / word-insertion bonus.
    lm_alpha = 0.75
    lm_beta = 1.85
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    fs_orig = wav_in.getframerate()
    if fs_orig != desired_sample_rate:
        # Resample on mismatch instead of failing outright.
        print('Warning: original sample rate ({}) is different than {}hz. \
Resampling might produce erratic speech recognition.'.format(
            fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(wav_in.readframes(wav_in.getnframes()), np.int16)
    wav_in.close()

    print('Running inference.', file=sys.stderr)
    parts = [ds.stt(audio), '\n',
             metadata_json_output(ds.sttWithMetadata(audio, 3))]
    return ''.join(parts)
def __init__(self, ):
    """Load the DeepSpeech model/scorer described by the module-level args.

    Stores the configured model on ``self.ds`` and its expected sample rate
    on ``self.desired_sample_rate``.
    """
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    t0 = timer()
    # sphinx-doc: python_ref_model_start
    # Paths are resolved relative to this source file, not the CWD.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    ds = Model(os.path.join(base_dir, args.model))
    # sphinx-doc: python_ref_model_stop
    print('Loaded model in {:.3}s.'.format(timer() - t0), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)
    self.desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        t0 = timer()
        ds.enableExternalScorer(os.path.join(base_dir, args.scorer))
        print('Loaded scorer in {:.3}s.'.format(timer() - t0), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        # Format: "word1:boost1,word2:boost2"
        for spec in args.hot_words.split(','):
            word, boost = spec.split(':')
            ds.addHotWord(word, float(boost))

    self.ds = ds
def setup_model(model_path, scorer, beam_width):
    """Create a DeepSpeech model, raising RuntimeError when tuning fails.

    *scorer* carries ``.scorer`` / ``.lm_alpha`` / ``.lm_beta`` attributes;
    any of them may be None, in which case that step is skipped.
    """
    log("creating model {} with scorer {}...".format(model_path, scorer))
    model = Model(model_path)
    if scorer.scorer is not None:
        model.enableExternalScorer(scorer.scorer)
        have_weights = (scorer.lm_alpha is not None
                        and scorer.lm_beta is not None)
        if have_weights:
            # DeepSpeech returns non-zero on failure rather than raising.
            if model.setScorerAlphaBeta(scorer.lm_alpha, scorer.lm_beta) != 0:
                raise RuntimeError("Unable to set scorer parameters")
    if beam_width is not None:
        if model.setBeamWidth(beam_width) != 0:
            raise RuntimeError("Unable to set beam width")
    log("model is ready.")
    return model
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float,
                       lm_beta: float, beam: int):
    """Streamlit speech-to-text demo that keeps the video stream on screen.

    Incoming audio frames are queued under a lock by a custom
    ``AudioProcessor`` (which echoes silence back so no audio is played),
    then drained by the main loop into a DeepSpeech stream whose
    intermediate decode is rendered live.
    """

    class AudioProcessor(AudioProcessorBase):
        frames_lock: threading.Lock
        frames: deque

        def __init__(self) -> None:
            self.frames_lock = threading.Lock()
            self.frames = deque([])

        async def recv_queued(self, frames: List[av.AudioFrame]) -> av.AudioFrame:
            # Stash the incoming frames for the main loop to consume.
            with self.frames_lock:
                self.frames.extend(frames)

            # Return empty frames to be silent.
            new_frames = []
            for frame in frames:
                input_array = frame.to_ndarray()
                new_frame = av.AudioFrame.from_ndarray(
                    np.zeros(input_array.shape, dtype=input_array.dtype),
                    layout=frame.layout.name,
                )
                new_frame.sample_rate = frame.sample_rate
                new_frames.append(new_frame)

            return new_frames

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text-w-video",
        mode=WebRtcMode.SENDRECV,
        audio_processor_factory=AudioProcessor,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": True,
            "audio": True
        },
    )

    status_indicator = st.empty()

    # Nothing to do until the browser starts streaming.
    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_processor:
            if stream is None:
                # Lazy-load the model on the first audio activity.
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)
                stream = model.createStream()
                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()

            # Drain everything queued by AudioProcessor under its lock.
            audio_frames = []
            with webrtc_ctx.audio_processor.frames_lock:
                while len(webrtc_ctx.audio_processor.frames) > 0:
                    frame = webrtc_ctx.audio_processor.frames.popleft()
                    audio_frames.append(frame)

            if len(audio_frames) == 0:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                # DeepSpeech expects mono audio at the model's sample rate.
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
def main():
    """CLI entry point: watch a directory of sound files and transcribe them.

    Builds the DeepSpeech model from the CLI flags, then hands off to
    ``process`` which performs the (optionally continuous) batch prediction
    and moves/deletes the processed inputs.
    """
    parser = argparse.ArgumentParser(
        description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument(
        '--prediction_in', required=True,
        help='Path to the directory with sound files (mp3/ogg/wav) to analyze')
    parser.add_argument(
        '--prediction_out', required=True,
        help='Path to the directory for moving the processed sound files to')
    parser.add_argument(
        '--prediction_tmp', required=False,
        help=
        'Path to the temp directory for storing the predictions initially before moving them to "--prediction_out"'
    )
    parser.add_argument(
        '--continuous', action='store_true',
        help='Whether to continuously load test images and perform prediction',
        required=False, default=False)
    parser.add_argument(
        '--delete_input', action='store_true',
        help=
        'Whether to delete the input files rather than move them to "--prediction_out" directory',
        required=False, default=False)
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument(
        '--lm_alpha', type=float,
        help=
        'Language model weight (lm_alpha). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--lm_beta', type=float,
        help=
        'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.'
    )
    parser.add_argument(
        '--json', required=False, action='store_true',
        help='Output json from metadata with timestamp of each word')
    parser.add_argument(
        '--candidate_transcripts', type=int, default=3,
        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument(
        '--normalize', required=False, action='store_true',
        help='Whether to apply standard amplitude normalization')
    parsed = parser.parse_args()

    print('Loading model from file {}'.format(parsed.model))
    ds = Model(parsed.model)
    if parsed.beam_width:
        ds.setBeamWidth(parsed.beam_width)
    if parsed.scorer:
        print('Loading scorer from file {}'.format(parsed.scorer))
        ds.enableExternalScorer(parsed.scorer)
        # Scorer weights only apply once a scorer is enabled.
        if parsed.lm_alpha and parsed.lm_beta:
            ds.setScorerAlphaBeta(parsed.lm_alpha, parsed.lm_beta)

    # Delegates the actual watch/transcribe/move loop.
    process(model=ds,
            prediction_in=parsed.prediction_in,
            prediction_out=parsed.prediction_out,
            prediction_tmp=parsed.prediction_tmp,
            continuous=parsed.continuous,
            delete_input=parsed.delete_input,
            json=parsed.json,
            candidate_transcripts=parsed.candidate_transcripts,
            normalize=parsed.normalize)
def main():
    """Run a single DeepSpeech inference over one WAV file (CLI).

    Prints the transcript in one of three formats: extended metadata,
    JSON with per-word timestamps, or plain text prefixed "Translation: ".
    The original progress prints were deliberately silenced (kept below as
    comments).
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

    # print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        # print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        # print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # Silently resample on a rate mismatch (warning print disabled):
        # print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Duration in seconds, from frame count and the file's native rate.
    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    # print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: "+ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float,
            beam: int):
    """Streamlit audio-only speech-to-text demo.

    Pulls audio frames from the WebRTC receiver, feeds them into a
    DeepSpeech stream, and renders the intermediate decode live.
    """
    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        audio_receiver_size=1024,
        rtc_configuration={
            "iceServers": [{
                "urls": ["stun:stun.l.google.com:19302"]
            }]
        },
        media_stream_constraints={
            "video": False,
            "audio": True
        },
    )

    status_indicator = st.empty()

    # Nothing to do until the browser starts streaming.
    if not webrtc_ctx.state.playing:
        return

    status_indicator.write("Loading...")
    text_output = st.empty()
    stream = None

    while True:
        if webrtc_ctx.audio_receiver:
            if stream is None:
                # Lazy-load the model on the first audio activity.
                from deepspeech import Model

                model = Model(model_path)
                model.enableExternalScorer(lm_path)
                model.setScorerAlphaBeta(lm_alpha, lm_beta)
                model.setBeamWidth(beam)
                stream = model.createStream()
                status_indicator.write("Model loaded.")

            sound_chunk = pydub.AudioSegment.empty()
            try:
                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
            except queue.Empty:
                time.sleep(0.1)
                status_indicator.write("No frame arrived.")
                continue

            status_indicator.write("Running. Say something!")

            for audio_frame in audio_frames:
                sound = pydub.AudioSegment(
                    data=audio_frame.to_ndarray().tobytes(),
                    sample_width=audio_frame.format.bytes,
                    frame_rate=audio_frame.sample_rate,
                    channels=len(audio_frame.layout.channels),
                )
                sound_chunk += sound

            if len(sound_chunk) > 0:
                # DeepSpeech expects mono audio at the model's sample rate.
                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(
                    model.sampleRate())
                buffer = np.array(sound_chunk.get_array_of_samples())
                stream.feedAudioContent(buffer)
                text = stream.intermediateDecode()
                text_output.markdown(f"**Text:** {text}")
        else:
            status_indicator.write("AudioReciver is not set. Abort.")
            break
from deepspeech import Model

# Load and free the model several times in a row — a smoke test for memory
# leaks in model construction / scorer attachment.
for i in range(5):
    ds = Model('/models/mozilla/deepspeech-0.7.3-models.pbmm')
    ds.enableExternalScorer('/models/mozilla/deepspeech-0.7.3-models.scorer')
    ds.setScorerAlphaBeta(0.75, 1.85)
    # BUG FIX: never invoke __del__() by hand — the finalizer can then run a
    # second time at garbage collection. Drop the reference instead and let
    # the runtime finalize the object exactly once.
    del ds
def record_voice_and_predict_text(self):
    """Record ~5 seconds of microphone audio to a WAV file, then transcribe it.

    Side effects: writes ``AudioFile/speech_stream.wav``, best-effort updates
    the GUI labels when present, and calls ``show_images`` with the
    predicted text.
    """
    # --- Recording ----------------------------------------------------
    stream_file_name = 'AudioFile/speech_stream.wav'
    stream_format = pyaudio.paInt16  # Sampling size and format
    no_of_channels = 1               # Number of audio channels (mono)
    sampling_rate = 16000            # Sampling rate in Hertz
    frames_count = 1024              # Number of frames per buffer
    record_seconds = 5

    stream = pyaudio.PyAudio()
    stream_data = stream.open(format=stream_format,
                              channels=no_of_channels,
                              rate=sampling_rate,
                              input=True,
                              frames_per_buffer=frames_count)
    frames = [
        stream_data.read(frames_count)
        for i in range(0, int(sampling_rate / frames_count * record_seconds))
    ]
    stream_data.stop_stream()
    stream_data.close()
    stream.terminate()

    # FIX: wave.open supports the context-manager protocol — guarantees the
    # file is closed even if a write fails.
    with wave.open(stream_file_name, 'wb') as wave_file:
        wave_file.setnchannels(no_of_channels)
        wave_file.setsampwidth(stream.get_sample_size(stream_format))
        wave_file.setframerate(sampling_rate)
        wave_file.writeframes(b''.join(frames))

    try:
        self.label_info.setText('Recording completed.')
    except Exception:
        # FIX: narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are no longer swallowed; GUI label may be absent in headless use.
        pass

    # --- Text prediction ----------------------------------------------
    alpha = 0.75       # scorer language-model weight
    beta = 1.85        # scorer word-insertion weight
    beam_width = 500   # larger beam = better results, slower decoding

    speech_model = Model(MODEL_PATH)
    speech_model.setBeamWidth(beam_width)
    # Enable language scorer to improve accuracy, then set its weights.
    speech_model.enableExternalScorer(SCORER_PATH)
    speech_model.setScorerAlphaBeta(alpha, beta)

    # scipy converts the wav file into the int16 numpy array DeepSpeech expects.
    _, audio = wav.read(stream_file_name)
    text = speech_model.stt(audio)
    try:
        self.text_pred.setText(text)
    except Exception:
        pass  # best-effort GUI update (see note above on narrowed except)
    show_images(text)
def main():
    """CLI: transcribe one WAV file, then demo the ``sentencefit`` alignment.

    Prints the transcript (extended / JSON / plain depending on flags),
    then force-aligns a fixed Māori reference sentence against the audio
    and dumps per-token letter/confidence/timestep triples.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
              file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        # Format: "word1:boost1,word2:boost2"
        for word_boost in args.hot_words.split(','):
            word,boost = word_boost.split(':')
            ds.addHotWord(word,float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Duration in seconds, from frame count and the file's native rate.
    audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
    # NOTE(review): ``sentencefit`` is not part of the stock DeepSpeech
    # stream API — presumably a fork-specific forced-alignment call; confirm
    # against the deployed deepspeech build.
    test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
    [print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}") for t in test.tokens]
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
# --- Module-level model setup ---------------------------------------------
model_load_start = timer()
ds = Model(model_path)
model_load_end = timer() - model_load_start
# BUG FIX: this timing measures the *model* load, not the scorer load — the
# original message said "Loaded scorer in:".
print('Loaded model in: ', model_load_end)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
    if lm_alpha and lm_beta:
        ds.setScorerAlphaBeta(lm_alpha, lm_beta)


def speech(audio):
    """Read *audio* (WAV path) into an int16 numpy array, resampling if the
    file's rate differs from the model's expected rate.

    NOTE(review): the visible body ends after computing ``audio_length`` —
    presumably inference follows in the original file; confirm upstream.
    """
    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(
            'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
            .format(fs_orig, desired_sample_rate),
            file=sys.stderr)
        fs_new, audio = convert_samplerate(audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    # Duration in seconds, from frame count and the file's native rate.
    audio_length = fin.getnframes() * (1 / fs_orig)
from deepspeech import Model
import gradio as gr
import numpy as np

# DeepSpeech 0.8.2 model files and decoder hyper-parameters.
model_file_path = "deepspeech-0.8.2-models.pbmm"
lm_file_path = "deepspeech-0.8.2-models.scorer"
beam_width = 100
lm_alpha = 0.93  # scorer language-model weight
lm_beta = 1.18   # scorer word-insertion bonus

# Module-level model: loaded once at import time.
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    """Coerce (sample_rate, samples) to 16 kHz for DeepSpeech.

    48 kHz input is normalized to int16 range and downsampled 3:1 by
    averaging consecutive triples; any rate other than 48k/16k raises
    ValueError.
    """
    if sr not in (
        48000,
        16000,
    ):  # Deepspeech only supports 16k, (we convert 48k -> 16k)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        y = (((y / max(np.max(y), 1)) * 32767).reshape(
            (-1, 3)).mean(axis=1).astype("int16"))
        sr = 16000
    return sr, y


def transcribe(speech, stream):
    # NOTE(review): this definition continues beyond the visible chunk of
    # the file; only its first statement is shown here.
    _, y = reformat_freq(*speech)