def transcribe_many_parallel(args, filepaths):
    """Transcribe every file in *filepaths*, one subprocess per file.

    A fresh Model is configured for each file and handed to a child
    process running ``transcribe_file``.  ``p.join()`` directly after
    ``p.start()`` makes the work effectively sequential; the subprocess
    boundary serves to release native resources between files.

    Bug fix: the original iterated ``for index, filepath in filepaths``,
    which tries to unpack each element of *filepaths* and fails for a
    plain list of path strings (``len(filepaths)`` below shows a simple
    sequence of paths is expected) — use ``enumerate`` instead.

    Args:
        args: parsed CLI namespace (model, scorer, beam_width, lm_alpha,
              lm_beta, hot_words).
        filepaths: sequence of audio file paths to transcribe.
    """
    for index, filepath in enumerate(filepaths):
        ds = Model(args.model)
        if args.beam_width:
            ds.setBeamWidth(args.beam_width)

        if args.scorer:
            print('Loading scorer from files {}'.format(args.scorer),
                  file=sys.stderr)
            scorer_load_start = timer()
            ds.enableExternalScorer(args.scorer)
            scorer_load_end = timer() - scorer_load_start
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end),
                  file=sys.stderr)
            if args.lm_alpha and args.lm_beta:
                ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

        if args.hot_words:
            print('Adding hot-words', file=sys.stderr)
            for word_boost in args.hot_words.split(','):
                word, boost = word_boost.split(':')
                ds.addHotWord(word, float(boost))

        p = Process(target=transcribe_file, args=(args, ds, filepath, index))
        p.start()
        p.join()
        print('{}: Transcribed file {} of {} from "{}"'.format(
            time.strftime("%H:%M:%S", time.localtime()),
            index + 1, len(filepaths), filepath))
def create_deepspeech_model(args):
    """Build a DeepSpeech Model configured from the parsed CLI arguments.

    Applies, in order: optional beam width, optional external scorer
    (with optional alpha/beta overrides) and optional hot-words, then
    returns the configured model.
    """
    model = Model(args.model)

    if args.beam_width:
        model.setBeamWidth(args.beam_width)

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer),
              file=sys.stderr)
        load_started = timer()
        model.enableExternalScorer(args.scorer)
        load_elapsed = timer() - load_started
        print('Loaded scorer in {:.3}s.'.format(load_elapsed),
              file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            model.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for entry in args.hot_words.split(','):
            word, boost = entry.split(':')
            model.addHotWord(word, float(boost))

    return model
def main():
    """Transcribe ``a.wav`` with a fixed local model/scorer pair and print the text."""
    model = Model("model.pbmm")
    model.enableExternalScorer("scorer.scorer")

    fin = wave.open("a.wav", 'rb')
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fs_orig = fin.getframerate()
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    # Large negative boost strongly penalises the word in decoding.
    model.addHotWord("proves", -5000.0)

    print("\n\nSTT:")
    print(model.stt(audio))
def load(model, scorer, verbose=True, beam_width="", lm_alpha="", lm_beta="", hot_words=""):
    """Load and configure a DeepSpeech model.

    Fix: replaced the unidiomatic ``verbose == True`` comparisons with
    plain truthiness checks (PEP 8).

    Args:
        model: path to the .pbmm model file.
        scorer: path to the external scorer file; falsy to skip.
        verbose: when truthy, print progress messages to stderr.
        beam_width: CTC decoder beam width; falsy keeps the model default.
        lm_alpha: scorer language-model weight; applied only together with lm_beta.
        lm_beta: scorer word-insertion bonus; applied only together with lm_alpha.
        hot_words: comma-separated 'word:boost' pairs; falsy to skip.

    Returns:
        Tuple ``(ds, desired_sample_rate)``: the configured Model and its
        native sample rate in Hz.
    """
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    if verbose:
        print('\nLoading model from files {}'.format(model), file=sys.stderr)
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if beam_width:
        ds.setBeamWidth(beam_width)

    desired_sample_rate = ds.sampleRate()

    if scorer:
        if verbose:
            print('Loading scorer from files {}'.format(scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer)
        scorer_load_end = timer() - scorer_load_start
        if verbose:
            print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    if hot_words:
        if verbose:
            print('Adding hot-words', file=sys.stderr)
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    return ds, desired_sample_rate
def __init__(self):
    """Load the DeepSpeech model described by the module-level ``args``.

    Model and scorer paths are resolved relative to this source file's
    directory.  The configured model is stored on ``self.ds`` and its
    native sample rate on ``self.desired_sample_rate``.
    """
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    base_dir = os.path.dirname(os.path.abspath(__file__))
    model = Model(os.path.join(base_dir, args.model))
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        model.setBeamWidth(args.beam_width)

    self.desired_sample_rate = model.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        model.enableExternalScorer(os.path.join(base_dir, args.scorer))
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            model.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for pair in args.hot_words.split(','):
            word, boost = pair.split(':')
            model.addHotWord(word, float(boost))

    self.ds = model
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        # Model and scorer live two directories above this file.
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it and is sent to the model to be
        transcribed. Returns the result of the transcribe audio in string
        format. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        """ Same as run() but returns the full Metadata object (word timings,
        confidence) instead of the plain transcript string. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them
        to the language model and return the list of the added hot-words """
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get numeric value of the boost
                boost = float(data.get(hot_word))
                # Bug fix: register the same lower-cased form that is logged
                # and returned (previously the original-case key was added,
                # so the reported word differed from the registered one).
                self.model.addHotWord(word, boost)
                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            # DeepSpeech raises RuntimeError e.g. when a word was already added.
            return []

    def erase_hot_word(self, hot_words) -> None:
        """ Erases each of the given hot-words from the language model. """
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            # Best-effort: a word that was never added simply stops the loop.
            return

    def clear_hot_words(self) -> str:
        """ Removes every hot-word from the language model and reports the outcome. """
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def sample_rate(self):
        """ Returns the model's native sample rate in Hz. """
        return self.model.sampleRate()
def main():
    """CLI entry point: parse arguments, load a DeepSpeech model and
    transcribe a single WAV file, printing the transcript to stdout and
    progress/timing information to stderr."""
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    # Model loading.
    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    # Optional scorer and hot-word configuration.
    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)
        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    # Audio loading, resampling if the WAV rate differs from the model's.
    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate),
              file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    # Inference in one of three output modes.
    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(
            ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    # Capture parameters for the streaming path.
    FORMAT = pyaudio.paInt16      # 16-bit PCM
    SAMPLE_RATE = 16000           # Hz
    CHANNELS = 1                  # mono
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        # Model and scorer live two directories above this file.
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        # mode=3 is the most aggressive WebRTC voice-activity setting.
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it and is sent to the model to be
        transcribed. Returns the result of the transcribe audio in string
        format. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        """ Same as run() but returns the full Metadata object (word timings,
        confidence) instead of the plain transcript string. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()),
                                          np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in form of hot-words and boosts, adds them
        to the language model and return the list of the added hot-words """
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get numeric value of the boost
                boost = float(data.get(hot_word))
                # Bug fix: register the same lower-cased form that is logged
                # and returned (previously the original-case key was added,
                # so the reported word differed from the registered one).
                self.model.addHotWord(word, boost)
                # Printing on the prompt the activity
                logger.info(
                    f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            # DeepSpeech raises RuntimeError e.g. when a word was already added.
            return []

    def erase_hot_word(self, hot_words) -> None:
        """ Erases each of the given hot-words from the language model. """
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            # Best-effort: a word that was never added simply stops the loop.
            return

    def clear_hot_words(self) -> str:
        """ Removes every hot-word from the language model and reports the outcome. """
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def deep_stream(self):
        """ Opens and returns a new DeepSpeech streaming-inference state. """
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """ Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration. """
        # n is a byte count: samples per frame times 2 bytes per 16-bit sample.
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
# Optional scorer / hot-word configuration for the already-created model `ds`.
if scorer:
    print('Loading scorer from files {}'.format(scorer))
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end))
    if lm_alpha and lm_beta:
        print("Set Scorer Alpha and Beta")
        ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    if hot_words:
        print('Adding hot-words')
        for word_boost in hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))


def get_audios_list(audios_path, with_path=False):
    """Return the .wav files directly inside *audios_path*.

    Args:
        audios_path: directory to scan; a non-existent path yields [].
        with_path: when True, each entry is joined with *audios_path*.

    Returns:
        List of file names (or joined paths) ending in '.wav'.

    Bug fix: the original used ``'.wav' in mfile``, which also matched
    names like 'x.wav.bak' and — with ``with_path=True`` — every entry of
    a directory whose own path contains '.wav'; ``endswith`` checks only
    the extension.
    """
    audios = []
    if path.exists(audios_path):
        for mfile in listdir(audios_path):
            if with_path:
                mfile = join(audios_path, mfile)
            if mfile.endswith('.wav'):
                audios.append(mfile)
    return audios


audios_list = get_audios_list(audios_path, with_path=True)