def recognize(model="../models/output_graph.pb", audio="../audio/2830-3980-0043.wav", alphabet="../models/alphabet.txt", lm="../models/lm.binary", trie="../models/trie"):
    """Load a model and transcribe a single WAV file.

    The model (and, when both ``lm`` and ``trie`` are truthy, the language
    model decoder) is loaded on every call. Progress messages, timings and
    the transcription itself are printed to stderr.

    Args:
        model: Path handed to ``Model`` as the model file.
        audio: Path of the WAV file, read with ``wav.read``.
        alphabet: Path of the alphabet file.
        lm: Path of the language model file.
        trie: Path of the trie file.

    Returns:
        The value returned by ``ds.stt`` for the audio.

    Raises:
        AssertionError: If the WAV file's sample rate is not 16000.
    """
    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % model)
    started = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - started))

    if lm and trie:
        report('Loading language model from files %s %s' % (lm, trie))
        started = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        report('Loaded language model in %0.3fs.' % (timer() - started))

    fs, samples = wav.read(audio)
    # Duration is computed assuming 16 kHz; the assert below enforces that rate.
    seconds = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    report('Running inference.')
    started = timer()
    result = ds.stt(samples, fs)
    report(result)
    # The measured time includes printing the result, as before.
    elapsed = timer() - started
    report('Inference took %0.3fs for %0.3fs audio file.' % (elapsed, seconds))
    return result
class transciber(object):
    """Speech-to-text helper that loads the model and its LM decoder once."""

    def __init__(self, modelPath, alphabet, lmPath, trie, numFeatures=26, numContext=9, beamWidth=500):
        """Load the model and enable the language-model decoder.

        Args:
            modelPath: Path handed to ``Model`` as the model file.
            alphabet: Path of the alphabet file.
            lmPath: Path of the language model file.
            trie: Path of the trie file.
            numFeatures: Second positional argument of ``Model``.
            numContext: Third positional argument of ``Model``.
            beamWidth: Fifth positional argument of ``Model``.
        """
        decoder_weights = (LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

        print('Loading model from file %s' % modelPath, file=sys.stderr)
        began = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet, beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, *decoder_weights)
        # The reported time covers both the model and the decoder setup.
        took = timer() - began
        print('Loaded model in %0.3fs.' % took, file=sys.stderr)

    def transcribe(self, audioPath):
        """Read ``audioPath`` with ``wav.read``, transcribe it, print and return the text."""
        rate, samples = wav.read(audioPath)
        # Duration assuming 16 kHz; computed but not used further.
        duration = len(samples) * (1 / 16000)
        text = self.model.stt(samples, rate)
        print(text)
        return text
class DeepSpeech:
    """Wrapper that loads a model once and transcribes WAV files on demand."""

    def __init__(self, model, alphabet, lm=None, trie=None):
        """Load the model and, if both ``lm`` and ``trie`` are given, the LM decoder.

        Args:
            model: Path handed to ``Model`` as the model file.
            alphabet: Path of the alphabet file.
            lm: Optional path of the language model file.
            trie: Optional path of the trie file.
        """
        print('Loading model from file %s' % model, file=sys.stderr)
        began = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        took = timer() - began
        print('Loaded model in %0.3fs.' % took, file=sys.stderr)

        # The decoder is only enabled when both files are supplied.
        if lm is None or trie is None:
            return

        print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        began = timer()
        self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                    WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        took = timer() - began
        print('Loaded language model in %0.3fs.' % took, file=sys.stderr)

    def stt(self, audio_file):
        """Transcribe ``audio_file`` and return the text.

        The result is printed to stdout; progress and timing go to stderr.
        """
        rate, samples = wav.read(audio_file)
        # Duration is reported assuming 16 kHz input.
        seconds = len(samples) * (1 / 16000)

        print('Running inference.', file=sys.stderr)
        began = timer()
        stt_result = self.ds.stt(samples, rate)
        print('Return result: ', stt_result)
        took = timer() - began
        print('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds), file=sys.stderr)
        return stt_result
class DeepSpeechImp:
    """Model wrapper configured entirely from ``shared_params``."""

    # Class-level default; __init__ replaces it with the loaded model.
    ds = None

    def __init__(self):
        """Load the model and its language-model decoder, logging the timings."""
        logging.info('Loading model from file %s' % (shared_params.DS_MODEL))
        began = timer()
        self.ds = Model(
            shared_params.DS_MODEL,
            N_FEATURES,
            N_CONTEXT,
            shared_params.DS_ALPHABET,
            shared_params.BEAM_WIDTH,
        )
        took = timer() - began
        logging.info('Loaded model in %0.3fs.' % (took))

        logging.info('Loading language model from files %s %s'
                     % (shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE))
        began = timer()
        self.ds.enableDecoderWithLM(
            shared_params.DS_ALPHABET,
            shared_params.DS_LANGUAGE_MODEL,
            shared_params.DS_TRIE,
            shared_params.LM_WEIGHT,
            shared_params.WORD_COUNT_WEIGHT,
            shared_params.VALID_WORD_COUNT_WEIGHT,
        )
        took = timer() - began
        logging.info('Loaded language model in %0.3fs.' % (took))

    def process_audio(self, audio_path):
        """Transcribe the WAV file at ``audio_path``.

        Returns:
            The transcription, or ``""`` if reading or inference raised; the
            exception text is logged at error level.
        """
        try:
            rate, samples = wav.read(audio_path)
            transcript = self.ds.stt(samples, rate)
        except Exception as ex:
            # Best effort: callers get an empty string instead of an exception.
            logging.error(str(ex))
            return ""
        return transcript

    def __del__(self):
        """Drop the instance's reference to the model."""
        del self.ds
def _worker_thread(self):
    """Worker loop: load the model once, then serve commands from ``self._queue``.

    Every queue item is a sequence whose first element is the command name:

    * ``'sample'`` followed by a sample object with a ``wav_path`` attribute:
      the file is transcribed, timing statistics are printed, and
      ``self.inference_done`` is emitted with ``(sample, result)``.
    * ``'stop'``: leaves the loop.

    Any other command is ignored.
    """
    print('restoring from {}'.format(model_file))
    model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH, BEAM_WIDTH)
    model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH, LM_TRIE_PATH,
                              LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    while True:
        cmd, *args = self._queue.get()
        if cmd == 'stop':
            break
        if cmd != 'sample':
            continue

        sample = args[0]
        # Close the WAV handle as soon as the frames are read; previously it
        # was never closed, leaking one file descriptor per sample.
        with wave.open(sample.wav_path) as wav_file:
            audio = np.frombuffer(wav_file.readframes(wav_file.getnframes()), dtype=np.int16)
            fs = wav_file.getframerate()

        start = time.time()
        result = model.stt(audio, fs)
        inference_time = time.time() - start

        wav_time = wav_length(sample.wav_path)
        print('wav length: {}\ninference time: {}\nRTF: {:2f}'.format(
            wav_time, inference_time, inference_time / wav_time))
        self.inference_done.emit(sample, result)

    # NOTE(review): `sess` is not defined in this method. Presumably it is a
    # module-level session object; if it is not, this line raises NameError
    # when the worker is stopped. Confirm and remove if stale.
    sess.close()
def main_deepspeech(args):
    """Load a model, transcribe ``args.audio`` and print the text to stdout.

    Args:
        args: Object with ``model``, ``alphabet``, ``lm``, ``trie`` and
            ``audio`` attributes. When ``None``, ``parse_args_deep()`` is
            called to obtain it.

    Raises:
        AssertionError: If the audio sample rate is not 16000.
    """
    if args is None:
        args = parse_args_deep()

    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % (args.model))
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - began))

    if args.lm and args.trie:
        report('Loading language model from files %s %s' % (args.lm, args.trie))
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        report('Loaded language model in %0.3fs.' % (timer() - began))

    # NOTE(review): this relies on `wave` exposing a `read` that returns
    # (rate, samples); the standard-library `wave` module has no `read`.
    # Confirm which module `wave` is bound to in this file.
    fs, samples = wave.read(args.audio)
    # Duration assumes 16 kHz, which the assert below enforces.
    seconds = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    report('Running inference.')
    began = timer()
    print(ds.stt(samples, fs))
    took = timer() - began
    report('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds))
class SpeechRecognizer:
    """Transcribes raw 16-bit audio buffers, resampling to ``SAMPLE_RATE`` if needed."""

    def __init__(self):
        """Load the model and enable its language-model decoder from module constants."""
        recognizer = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH, BEAM_WIDTH)
        self._model = recognizer
        recognizer.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH, TRIE_PATH,
                                       LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    def speech_to_text(self, audio_buffer, sample_rate):
        """Return the transcription of ``audio_buffer``.

        Args:
            audio_buffer: Buffer interpreted as int16 samples.
            sample_rate: Sample rate of ``audio_buffer``.
        """
        log = app.logger
        log.info('processing audio file')
        samples = self._process_audio_data(audio_buffer, sample_rate)

        log.info('starting recognition')
        began = time()
        text = self._model.stt(samples, SAMPLE_RATE)
        finished = time()
        log.info('finished in {:.3f}s'.format(finished - began))
        return text

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        """View the buffer as int16 samples and resample unless already at ``SAMPLE_RATE``."""
        samples = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate == SAMPLE_RATE:
            return samples
        return self._resample(samples, original_sample_rate)

    def _resample(self, audio, original_sample_rate):
        """Resample ``audio`` to ``SAMPLE_RATE`` and cast the result back to int16."""
        duration = len(audio) / original_sample_rate
        target_count = int(duration * SAMPLE_RATE)
        resampled = signal.resample(audio, target_count)
        return resampled.astype(np.int16)
def main():
    """Transcribe every WAV file of the Flickr audio set into a tab-separated file.

    The model, alphabet, language model and trie are loaded from fixed paths
    under ``models/``. For each ``*.wav`` file in the hard-coded Flickr
    directory one line ``<basename>\\t<transcription>`` is appended to
    ``flickr_audio_transcription.txt`` and flushed immediately, so partial
    results survive an interruption. Files that raise ``ValueError`` are
    reported on stdout and skipped.
    """
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    # Resample the file currently being processed. The code
                    # used to pass `args.audio`, but no `args` exists in this
                    # function, so any non-16 kHz file raised NameError.
                    fs, audio = convert_samplerate(audio_f)
                basename = os.path.splitext(os.path.basename(audio_f))[0]
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
def load_model():
    """Load the model and language-model decoder from fixed ``./models`` paths.

    Timing information is printed to stderr.

    Returns:
        The loaded ``Model`` instance.
    """
    model_path = './models/output_graph.pb'
    alphabet_path = './models/alphabet.txt'
    lm_path = './models/lm.binary'
    trie_path = './models/trie'

    print('Loading model from file {}'.format(model_path), file=sys.stderr)
    began = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    took = timer() - began
    print('Loaded model in {:.3}s.'.format(took), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files {} {}'.format(lm_path, trie_path),
              file=sys.stderr)
        began = timer()
        ds.enableDecoderWithLM(
            alphabet_path,
            lm_path,
            trie_path,
            aLMWeight=LM_WEIGHT,
            aValidWordCountWeight=VALID_WORD_COUNT_WEIGHT,
            aWordCountWeight=WORD_COUNT_WEIGHT,
        )
        took = timer() - began
        print('Loaded language model in {:.3}s.'.format(took), file=sys.stderr)

    return ds
def build_model(init_settings):
    """Build a model with its LM decoder from the ``deepspeech`` settings section.

    :param init_settings: Configparser-like mapping; its ``'deepspeech'``
        section must provide ``model_path``, ``N_FEATURES``, ``N_CONTEXT``,
        ``alphabet_path``, ``BEAM_WIDTH``, ``lm_path``, ``trie_path``,
        ``LM_WEIGHT``, ``WORD_COUNT_WEIGHT`` and ``VALID_WORD_COUNT_WEIGHT``.
    :return: The loaded model, or ``None`` if anything raised while loading
        (the error is printed to stdout).
    """
    print('Loading DeepSpeech Models')
    try:
        section = init_settings['deepspeech']
        # Values are converted in the same order they are passed on.
        model_args = (
            str(section['model_path']),
            int(section['N_FEATURES']),
            int(section['N_CONTEXT']),
            str(section['alphabet_path']),
            int(section['BEAM_WIDTH']),
        )
        ds = Model(*model_args)

        decoder_args = (
            str(section['alphabet_path']),
            str(section['lm_path']),
            str(section['trie_path']),
            float(section['LM_WEIGHT']),
            float(section['WORD_COUNT_WEIGHT']),
            float(section['VALID_WORD_COUNT_WEIGHT']),
        )
        ds.enableDecoderWithLM(*decoder_args)
    except Exception as e:
        print('Loading Error!')
        print(e)
        return None
    return ds
def main():
    """Command-line entry point: load a model, transcribe one WAV file, print the text.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet`` and the
    optional ``lm`` and ``trie``. Progress and timings go to stderr, the
    transcription to stdout.

    Raises:
        AssertionError: If the audio sample rate is not 16000.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    # NOTE(review): `model`, `audio` and `alphabet` are required positionals,
    # so argparse never falls back to their `default` values.
    argument_specs = (
        ('model', dict(type=str,
                       help='Path to the model (protocol buffer binary file)',
                       default="models/output_graph.pb")),
        ('audio', dict(type=str,
                       help='Path to the audio file to run (WAV format)',
                       default="sample_input.wav")),
        ('alphabet', dict(type=str,
                          help='Path to the configuration file specifying the alphabet used by the network',
                          default="models/alphabet.txt")),
        ('lm', dict(type=str, nargs='?',
                    help='Path to the language model binary file',
                    default="models/lm.binary")),
        ('trie', dict(type=str, nargs='?',
                      help='Path to the language model trie file created with native_client/generate_trie',
                      default="models/trie")),
    )
    for name, options in argument_specs:
        parser.add_argument(name, **options)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    took = timer() - began
    print('Loaded model in %0.3fs.' % (took), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        took = timer() - began
        print('Loaded language model in %0.3fs.' % (took), file=sys.stderr)

    fs, samples = wav.read(args.audio)
    # Duration assumes 16 kHz, which the assert below enforces.
    seconds = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    began = timer()
    print(ds.stt(samples, fs))
    took = timer() - began
    print('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds),
          file=sys.stderr)
def main(model, alphabet, lm, trie, dest):
    """Transcribe the numbered WAV clips in ``dest`` and return the texts.

    The directory is expected to contain clips named ``0.wav``, ``1.wav``, …;
    one clip is read for every entry ``os.listdir(dest)`` reports. Clips that
    are not sampled at 16 kHz are converted with ``convert_samplerate``.
    Each transcription is printed to stdout; progress and timings go to
    stderr.

    Args:
        model: Path handed to ``Model`` as the model file.
        alphabet: Path of the alphabet file.
        lm: Path of the language model file; the decoder is enabled only if
            both ``lm`` and ``trie`` are truthy.
        trie: Path of the trie file.
        dest: Directory holding the clips, with or without a trailing path
            separator.

    Returns:
        List of transcriptions in clip-number order.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                               VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    subs = []
    # Only the number of directory entries matters; clips are addressed by
    # index, not by the listed names.
    for i, _ in enumerate(os.listdir(dest)):
        # os.path.join works whether or not `dest` ends with a separator;
        # plain string concatenation required the trailing separator.
        clip_path = os.path.join(dest, str(i) + '.wav')
        fs, audio = wav.read(clip_path)
        if fs != 16000:
            if fs < 16000:
                print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
            fs, audio = convert_samplerate(clip_path)
        audio_length = len(audio) * (1 / 16000)

        inference_start = timer()
        text = ds.stt(audio, fs)
        subs.append(text)
        print(text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return subs
def main():
    """Command-line entry point: load a model, transcribe one WAV file, print the text.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, then ``audio``. Audio that is not sampled at 16 kHz is
    converted with ``convert_samplerate``. Progress and timings go to stderr,
    the transcription to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    for name, options in (
        ('model', {'help': 'Path to the model (protocol buffer binary file)'}),
        ('alphabet', {'help': 'Path to the configuration file specifying the alphabet used by the network'}),
        ('lm', {'nargs': '?', 'help': 'Path to the language model binary file'}),
        ('trie', {'nargs': '?', 'help': 'Path to the language model trie file created with native_client/generate_trie'}),
        ('audio', {'help': 'Path to the audio file to run (WAV format)'}),
    ):
        parser.add_argument(name, **options)
    args = parser.parse_args()

    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % (args.model))
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - began))

    if args.lm and args.trie:
        report('Loading language model from files %s %s' % (args.lm, args.trie))
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        report('Loaded language model in %0.3fs.' % (timer() - began))

    fs, samples = wav.read(args.audio)
    if fs != 16000:
        if fs < 16000:
            report('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs))
        fs, samples = convert_samplerate(args.audio)
    seconds = len(samples) * (1 / 16000)

    report('Running inference.')
    began = timer()
    print(ds.stt(samples, fs))
    took = timer() - began
    report('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds))
def setup_model(model_path, alphabet, lm, trie):
    """Create a model, optionally with the language-model decoder enabled.

    Args:
        model_path: Path handed to ``Model`` as the model file.
        alphabet: Path of the alphabet file.
        lm: Path of the language model file.
        trie: Path of the trie file.

    Returns:
        The model, or ``None`` when ``model_path`` or ``alphabet`` is falsy.
        The decoder is enabled only if both ``lm`` and ``trie`` are truthy.
    """
    if not (model_path and alphabet):
        return None

    print("creating model {} {}".format(model_path, alphabet))
    ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                     WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds_model
def load_model():
    """Load the model and enable its LM decoder from files in the working directory.

    Returns:
        The loaded ``Model`` instance.
    """
    paths = {
        'model': 'output_graph.pb',
        'alphabet': 'alphabet.txt',
        'lm': 'lm.binary',
        'trie': 'trie',
    }
    ds = Model(paths['model'], N_FEATURES, N_CONTEXT, paths['alphabet'], BEAM_WIDTH)
    ds.enableDecoderWithLM(paths['alphabet'], paths['lm'], paths['trie'],
                           LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds
def setup_model(model_path, alphabet, lm, trie):
    """Create a model, optionally enable its LM decoder, and log progress via ``log``.

    Args:
        model_path: Path handed to ``Model`` as the model file.
        alphabet: Path of the alphabet file.
        lm: Path of the language model file.
        trie: Path of the trie file.

    Returns:
        The model; the decoder is enabled only if both ``lm`` and ``trie``
        are truthy.
    """
    log("creating model {} {}...".format(model_path, alphabet))
    ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    use_language_model = bool(lm and trie)
    if use_language_model:
        ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                     WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    log("model is ready.")
    return ds_model
def main():
    """Command-line entry point: transcribe one WAV file and report timings.

    Options: ``--model``, ``--alphabet``, ``--lm``, ``--trie``, ``--audio``
    and ``--version``. With ``--version`` the versions are printed via
    ``print_versions()`` and nothing is loaded. Audio that is not sampled at
    16 kHz is converted with ``convert_samplerate``. Progress and timings go
    to stderr, the transcription to stdout.

    Returns:
        ``0`` when ``--version`` was requested, otherwise ``None``.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    # nargs='?' with const lets a bare `--version` act as a flag (it used to
    # demand a value) while `--version VALUE` is still accepted.
    parser.add_argument('--version', nargs='?', const=True,
                        help='Print version and exits')
    args = parser.parse_args()

    if args.version:
        print_versions()
        return 0

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    try:
        original_rate = fin.getframerate()
        frame_count = fin.getnframes()
        if original_rate != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(original_rate), file=sys.stderr)
            fs, audio = convert_samplerate(args.audio)
        else:
            fs = original_rate
            audio = np.frombuffer(fin.readframes(frame_count), np.int16)
    finally:
        # Close the file even if reading or resampling raises.
        fin.close()

    # Duration from the file's own frame count and rate. Dividing by a fixed
    # 16000 misreported the length of every resampled file.
    audio_length = float(frame_count) / original_rate

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length),
          file=sys.stderr)
def loadModel():
    """Load the model and its LM decoder into the module-level ``ds``.

    Paths come from the module constants ``MODEL_FILE``, ``ALPHABET_FILE``,
    ``LM_BINARY_FILE`` and ``TRIE_FILE``. Timings are printed to stderr.
    """
    global ds

    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % (MODEL_FILE))
    began = timer()
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - began))

    report('Loading language model from files %s %s' % (LM_BINARY_FILE, TRIE_FILE))
    began = timer()
    ds.enableDecoderWithLM(ALPHABET_FILE, LM_BINARY_FILE, TRIE_FILE, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    report('Loaded language model in %0.3fs.' % (timer() - began))
def stt(audioPath):
    """Transcribe the WAV file at ``audioPath`` with a freshly loaded model.

    The model, alphabet, language model and trie paths are read from ``conf``
    on every call. The LM decoder is enabled only if both the ``lm`` and
    ``trie`` settings are truthy.

    Returns:
        The value returned by the model's ``stt`` for the audio.
    """
    setting_names = ('model', 'alphabet', 'lm', 'trie')
    model, alphabet, lm, trie = [conf.get_config(name) for name in setting_names]

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    rate, samples = wav.read(audioPath)
    return ds.stt(samples, rate)
def main():
    """Command-line entry point: transcribe every WAV file matching a glob pattern.

    Positional arguments, in order: ``model``, ``audio`` (a glob pattern),
    ``alphabet`` and the optional ``lm`` and ``trie``. For each matching file,
    in sorted order, the transcription is printed to stdout and written to a
    sibling ``.txt`` file; files whose ``.txt`` already exists are skipped.

    Raises:
        AssertionError: If a file's sample rate is not 16000.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    for name, help_text, extra in (
        ('model', 'Path to the model (protocol buffer binary file)', {}),
        ('audio', 'Path to the audio file to run (WAV format)', {}),
        ('alphabet', 'Path to the configuration file specifying the alphabet used by the network', {}),
        ('lm', 'Path to the language model binary file', {'nargs': '?'}),
        ('trie', 'Path to the language model trie file created with native_client/generate_trie', {'nargs': '?'}),
    ):
        parser.add_argument(name, type=str, help=help_text, **extra)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    took = timer() - began
    print('Loaded model in %0.3fs.' % (took), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        took = timer() - began
        print('Loaded language model in %0.3fs.' % (took), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        target = os.path.splitext(path)[0] + '.txt'
        if os.path.exists(target):
            # Already transcribed on an earlier run.
            continue

        fs, samples = wav.read(path)
        # Duration assumes 16 kHz, which the assert below enforces.
        seconds = len(samples) * (1 / 16000)
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

        print('Running inference of %s.' % path, file=sys.stderr)
        began = timer()
        text = ds.stt(samples, fs)
        print(text)
        took = timer() - began
        print('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds),
              file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
class MozillaDeepSpeechASREngine(ASREngine):
    """ASR engine backed by Mozilla DeepSpeech (https://github.com/mozilla/DeepSpeech)."""

    def __init__(self, model_path, alphabet_path, language_model_path=None, trie_path=None):
        """
        Constructor.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file. Optional;
        decoding with a language model is enabled only when this and ``trie_path`` are both set.
        :param trie_path: Absolute path to trie. Optional; see ``language_model_path``.
        """
        # Parameter values follow
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(
            aModelPath=model_path,
            aNCep=26,
            aNContext=9,
            aAlphabetConfigPath=alphabet_path,
            aBeamWidth=500,
        )

        use_language_model = language_model_path is not None and trie_path is not None
        if use_language_model:
            self._model.enableDecoderWithLM(
                aAlphabetConfigPath=alphabet_path,
                aLMPath=language_model_path,
                aTriePath=trie_path,
                aLMWeight=1.75,
                aWordCountWeight=1.0,
                aValidWordCountWeight=1.0,
            )
        self._with_language_model = use_language_model

    def transcribe(self, path):
        """Read the audio file at ``path`` and return its transcription."""
        samples, sample_rate = soundfile.read(path)
        # Scale by the int16 maximum before casting to 16-bit integers.
        scale = np.iinfo(np.int16).max
        samples = (scale * samples).astype(np.int16)
        return self._model.stt(samples, aSampleRate=sample_rate)

    def __str__(self):
        suffix = ' (with language model)' if self._with_language_model else ''
        return 'Mozilla DeepSpeech' + suffix
def load_model(model_path, alphabet_path, lm_path, trie_path):
    """Load a model and, when both ``lm_path`` and ``trie_path`` are truthy, its LM decoder.

    Timing information is printed to stderr.

    Args:
        model_path: Path handed to ``Model`` as the model file.
        alphabet_path: Path of the alphabet file.
        lm_path: Path of the language model file.
        trie_path: Path of the trie file.

    Returns:
        The loaded ``Model`` instance.
    """
    print('Loading model from file %s' % (model_path), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm_path and trie_path:
        print('Loading language model from files %s %s' % (lm_path, trie_path),
              file=sys.stderr)
        lm_load_start = timer()
        # Use this function's own parameters. The call used to read
        # `args.alphabet`, `args.lm` and `args.trie`, but no `args` is
        # defined in this function.
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    return ds
def load_model():
    """Load the model and enable its LM decoder from files in the working directory.

    No progress or timing output is produced.

    Returns:
        The loaded ``Model`` instance.
    """
    model_path, alphabet_path = 'output_graph.pb', 'alphabet.txt'
    lm_path, trie_path = 'lm.binary', 'trie'

    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT,
                           WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    return ds
def main():
    """Command-line entry point: load a model, transcribe one WAV file, print the text.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, then ``audio``. Audio that is not sampled at 16 kHz is
    converted with ``convert_samplerate``. Progress and timings go to stderr,
    the transcription to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    required = {'type': str}
    optional = {'type': str, 'nargs': '?'}
    parser.add_argument('model', help='Path to the model (protocol buffer binary file)', **required)
    parser.add_argument('alphabet', help='Path to the configuration file specifying the alphabet used by the network', **required)
    parser.add_argument('lm', help='Path to the language model binary file', **optional)
    parser.add_argument('trie', help='Path to the language model trie file created with native_client/generate_trie', **optional)
    parser.add_argument('audio', help='Path to the audio file to run (WAV format)', **required)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    tick = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_seconds = timer() - tick
    print('Loaded model in %0.3fs.' % (model_seconds), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        tick = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_seconds = timer() - tick
        print('Loaded language model in %0.3fs.' % (lm_seconds), file=sys.stderr)

    fs, samples = wav.read(args.audio)
    needs_resampling = fs != 16000
    if needs_resampling:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
        fs, samples = convert_samplerate(args.audio)
    clip_seconds = len(samples) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    tick = timer()
    print(ds.stt(samples, fs))
    inference_seconds = timer() - tick
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_seconds, clip_seconds),
          file=sys.stderr)
class DeepSpeech:
    """Wrap DeepSpeech and provide the methods we need"""

    def __init__(self, settings):
        """Store decoder parameters and file paths; nothing is loaded yet.

        Args:
            settings: Mapping whose ``get`` supplies the ``'graph'``,
                ``'alphabet'``, ``'lm'`` and ``'trie'`` paths. Missing keys
                yield ``None``.
        """
        # File locations.
        self.alphabet = settings.get('alphabet')
        self.lm = settings.get('lm')
        self.trie = settings.get('trie')
        self.graph = settings.get('graph')

        # Arguments later passed to Model(...).
        self.n_features = 26
        self.n_context = 9
        self.beam_width = 1024

        # Arguments later passed to enableDecoderWithLM(...).
        self.lm_weight = 1.75
        self.word_count_weight = 1.00
        self.valid_word_count_weight = 1.00

    def load_model(self):
        """Create ``self.model`` and, if both ``lm`` and ``trie`` are set, enable the LM decoder.

        Load times are printed to stdout.
        """
        began = timer()
        self.model = Model(self.graph, self.n_features, self.n_context,
                           self.alphabet, self.beam_width)
        finished = timer()
        print('Loaded model in %0.3fs.' % (finished - began))

        if self.lm is None or self.trie is None:
            return

        began = timer()
        self.model.enableDecoderWithLM(
            self.alphabet,
            self.lm,
            self.trie,
            self.lm_weight,
            self.word_count_weight,
            self.valid_word_count_weight,
        )
        finished = timer()
        print('Loaded language model in %0.3fs.' % (finished - began))

    def oneshoot(self, wav_file):
        """Transcribe ``wav_file`` in one call.

        Returns:
            Tuple ``(result, latency)``: the transcription and the time
            spent in ``self.model.stt`` as measured with ``timer()``.
        """
        rate, samples = wav.read(wav_file)
        began = timer()
        result = self.model.stt(samples, rate)
        latency = timer() - began
        # Duration assuming 16 kHz; computed but not returned.
        seconds = len(samples) * (1 / 16000)
        return result, latency
class SpeechToText():
    """Loads a model with its LM decoder from one directory and transcribes audio."""

    def __init__(self, model_path):
        """Load the model files found in ``model_path``.

        Args:
            model_path: Directory containing ``output_graph.pb``,
                ``alphabet.txt``, ``lm.binary`` and ``trie``.
        """
        # Constants follow
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        n_features, n_context, beam_width = 26, 9, 500
        lm_weight, word_count_weight, valid_word_count_weight = 1.75, 1.00, 1.00

        def in_model_dir(filename):
            return os.path.join(model_path, filename)

        graph_file = in_model_dir("output_graph.pb")
        alphabet_file = in_model_dir("alphabet.txt")
        lm_file = in_model_dir("lm.binary")
        trie_file = in_model_dir("trie")

        self.model = Model(graph_file, n_features, n_context, alphabet_file, beam_width)
        self.model.enableDecoderWithLM(alphabet_file, lm_file, trie_file, lm_weight,
                                       word_count_weight, valid_word_count_weight)

    def run(self, audio, fs):
        """Return the model's transcription of ``audio`` sampled at ``fs``."""
        transcript = self.model.stt(audio, fs)
        return transcript
def load_model(args):
    """Load the model described by ``args`` and optionally enable its LM decoder.

    Timing information is printed to stderr.

    Args:
        args: Object with ``model``, ``alphabet``, ``lm`` and ``trie``
            attributes. The decoder is enabled only if ``lm`` and ``trie``
            are both truthy.

    Returns:
        The loaded ``Model`` instance.
    """
    # Arguments for Model(...).
    N_FEATURES, N_CONTEXT, BEAM_WIDTH = 26, 9, 500
    # Arguments for enableDecoderWithLM(...).
    LM_WEIGHT, WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT = 1.75, 1.00, 1.00

    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % (args.model))
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - began))

    if args.lm and args.trie:
        report('Loading language model from files %s %s' % (args.lm, args.trie))
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        report('Loaded language model in %0.3fs.' % (timer() - began))

    return ds
def load_ds_model(self):
    """
    Load the model configured on this object, logging how long each step takes.

    The language model and trie are loaded only when both ``self.lm`` and
    ``self.trie`` are set.

    return: the loaded ``Model`` object
    """
    logging.info('Loading model from file %s' % (self.model))
    began = timer()
    ds = Model(self.model, self.N_FEATURES, self.N_CONTEXT, self.alphabet,
               self.BEAM_WIDTH)
    took = timer() - began
    logging.info('Loaded model in %0.3fs.' % (took))

    if not (self.lm and self.trie):
        return ds

    logging.info('Loading language model from files %s %s' % (self.lm, self.trie))
    began = timer()
    ds.enableDecoderWithLM(self.alphabet, self.lm, self.trie, self.LM_WEIGHT,
                           self.WORD_COUNT_WEIGHT, self.VALID_WORD_COUNT_WEIGHT)
    took = timer() - began
    logging.info('Loaded language model in %0.3fs.' % (took))
    return ds
def recognize_deepspeech(audio):
    """Transcribe the WAV file at ``audio`` using the models bundled next to this module.

    The model, alphabet, language model and trie are loaded from the
    ``models`` directory beside this source file on every call. Audio that is
    not sampled at 16 kHz is converted with ``convert_samplerate``; a warning
    is printed to stderr when the original rate is below 16 kHz.

    Args:
        audio: Path of the WAV file to transcribe.

    Returns:
        The value returned by the model's ``stt`` for the audio.
    """
    here = path.dirname(path.realpath(__file__))
    model = path.join(here, 'models/output_graph.pb')
    alphabet = path.join(here, 'models/alphabet.txt')
    lm = path.join(here, 'models/lm.binary')
    trie = path.join(here, 'models/trie')

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    # Both paths are always non-empty here, so the decoder is always enabled.
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    # Keep the path in `audio` and the decoded samples in their own name.
    # Rebinding `audio` to the sample array meant convert_samplerate received
    # the array instead of the file path.
    fs, samples = wav.read(audio)
    if fs != 16000:
        if fs < 16000:
            print(
                'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                % (fs),
                file=sys.stderr)
        fs, samples = convert_samplerate(audio)

    return ds.stt(samples, fs)
def main():
    """Command-line entry point: transcribe one WAV file and print the text to stdout.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet`` and the
    optional ``lm`` and ``trie``. The LM decoder is enabled only when both
    ``lm`` and ``trie`` are given. No timing output is produced.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    help_texts = (
        ('model', 'Path to the model (protocol buffer binary file)'),
        ('audio', 'Path to the audio file to run (WAV format)'),
        ('alphabet', 'Path to the configuration file specifying the alphabet used by the network'),
    )
    for name, text in help_texts:
        parser.add_argument(name, type=str, help=text)
    optional_help_texts = (
        ('lm', 'Path to the language model binary file'),
        ('trie', 'Path to the language model trie file created with native_client/generate_trie'),
    )
    for name, text in optional_help_texts:
        parser.add_argument(name, type=str, nargs='?', help=text)
    args = parser.parse_args()

    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    if args.lm and args.trie:
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    rate, samples = wav.read(args.audio)
    transcript = ds.stt(samples, rate)
    print(transcript)
def deepspeech_main():
    """Command-line entry point: load a model, transcribe one WAV file, print the text.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet`` and the
    optional ``lm`` and ``trie``. Progress and timings go to stderr, the
    transcription to stdout.

    Raises:
        AssertionError: If the audio sample rate is not 16000.
    """
    # --- Beam search decoder settings ---
    # Beam width used in the CTC decoder when building candidate transcriptions.
    BEAM_WIDTH = 500
    # Alpha hyperparameter of the CTC decoder: language model weight.
    LM_WEIGHT = 1.75
    # Beta hyperparameter of the CTC decoder: word insertion weight (penalty).
    WORD_COUNT_WEIGHT = 1.00
    # Valid word insertion weight; lessens the word insertion penalty when the
    # inserted word is part of the vocabulary.
    VALID_WORD_COUNT_WEIGHT = 1.00

    # --- Graph geometry ---
    # These are tied to the shape of the graph (they determine the geometry of
    # the first layer), so they must match the values used during training.
    # Number of MFCC features to use.
    N_FEATURES = 26
    # Size of the context window used for producing timesteps in the input vector.
    N_CONTEXT = 9

    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    for name, options in (
        ('model', dict(help='Path to the model (protocol buffer binary file)')),
        ('audio', dict(help='Path to the audio file to run (WAV format)')),
        ('alphabet', dict(help='Path to the configuration file specifying the alphabet used by the network')),
        ('lm', dict(nargs='?', help='Path to the language model binary file')),
        ('trie', dict(nargs='?', help='Path to the language model trie file created with native_client/generate_trie')),
    ):
        parser.add_argument(name, type=str, **options)
    args = parser.parse_args()

    def report(text):
        print(text, file=sys.stderr)

    report('Loading model from file %s' % (args.model))
    began = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    report('Loaded model in %0.3fs.' % (timer() - began))

    if args.lm and args.trie:
        report('Loading language model from files %s %s' % (args.lm, args.trie))
        began = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        report('Loaded language model in %0.3fs.' % (timer() - began))

    fs, samples = wav.read(args.audio)
    # Duration assumes 16 kHz, which the assert below enforces.
    seconds = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    report('Running inference.')
    began = timer()
    print(ds.stt(samples, fs))
    took = timer() - began
    report('Inference took %0.3fs for %0.3fs audio file.' % (took, seconds))
if __name__ == '__main__':
    # Script entry point: load the model, then serve the recognition API
    # with bjoern until interrupted.
    args, params, err = setup_args()
    if err:
        check_err(err)

    model_keys = ("model", "n_features", "n_context", "alphabet", "beam_width")
    ds = Model(*[params[key] for key in model_keys])

    decoder_keys = ("alphabet", "lm", "trie", "lm_weight",
                    "word_count_weight", "valid_word_count_weight")
    ds.enableDecoderWithLM(*[params[key] for key in decoder_keys])

    request_logger = setup_logger()
    # A single route; the resource receives the loaded model.
    url_map = {"/api/reco": SpeechRecognitionResource(ds)}
    api = setup_api(url_map, middleware=request_logger)

    try:
        bjoern.run(api, host='0.0.0.0', port=args.port)
    except KeyboardInterrupt:
        # Ctrl-C is a normal shutdown.
        sys.exit(0)