def main_deepspeech(args):
    """Load a model, optionally attach a language model, and transcribe one WAV.

    Progress and timing messages are written to stderr; the text returned by
    ``ds.stt`` is printed to stdout.

    :param args: object exposing ``model``, ``alphabet``, ``lm``, ``trie`` and
        ``audio`` attributes. When ``None`` it is obtained from
        ``parse_args_deep()``.
    :raises AssertionError: if the audio sample rate is not 16000 Hz.
    """
    if args is None:
        args = parse_args_deep()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    tic = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_secs = timer() - tic
    print('Loaded model in %0.3fs.' % (model_secs), file=sys.stderr)

    # The language model is attached only when both of its files are given.
    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        tic = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_secs = timer() - tic
        print('Loaded language model in %0.3fs.' % (lm_secs), file=sys.stderr)

    fs, audio = wave.read(args.audio)
    # Duration is derived under the 16 kHz assumption enforced just below.
    audio_secs = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    tic = timer()
    transcript = ds.stt(audio, fs)
    print(transcript)
    infer_secs = timer() - tic
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (infer_secs, audio_secs), file=sys.stderr)
def build_model(init_settings):
    """Create a model from configuration and enable its language-model decoder.

    :param init_settings: Configparser-style mapping whose ``'deepspeech'``
        section provides ``model_path``, ``N_FEATURES``, ``N_CONTEXT``,
        ``alphabet_path``, ``BEAM_WIDTH``, ``lm_path``, ``trie_path``,
        ``LM_WEIGHT``, ``WORD_COUNT_WEIGHT`` and ``VALID_WORD_COUNT_WEIGHT``.
    :return: the configured model, or ``None`` if anything raised while
        loading (the error is printed).
    """
    print('Loading DeepSpeech Models')
    try:
        section = init_settings['deepspeech']
        model = Model(
            str(section['model_path']),
            int(section['N_FEATURES']),
            int(section['N_CONTEXT']),
            str(section['alphabet_path']),
            int(section['BEAM_WIDTH']),
        )
        model.enableDecoderWithLM(
            str(section['alphabet_path']),
            str(section['lm_path']),
            str(section['trie_path']),
            float(section['LM_WEIGHT']),
            float(section['WORD_COUNT_WEIGHT']),
            float(section['VALID_WORD_COUNT_WEIGHT']),
        )
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print('Loading Error!')
        print(err)
        return None
    return model
class DeepSpeechImp:
    """Load a model and language model once, then transcribe WAV files on demand."""

    # Class-level default so ``ds`` is always readable, even when no
    # instance-level value has been assigned.
    ds = None

    def __init__(self):
        """Load the model and language model named in ``shared_params``.

        Load times are reported through ``logging.info``.
        """
        logging.info('Loading model from file %s', shared_params.DS_MODEL)
        model_load_start = timer()
        self.ds = Model(shared_params.DS_MODEL, N_FEATURES, N_CONTEXT,
                        shared_params.DS_ALPHABET, shared_params.BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        logging.info('Loaded model in %0.3fs.', model_load_end)

        logging.info('Loading language model from files %s %s',
                     shared_params.DS_LANGUAGE_MODEL, shared_params.DS_TRIE)
        lm_load_start = timer()
        self.ds.enableDecoderWithLM(shared_params.DS_ALPHABET,
                                    shared_params.DS_LANGUAGE_MODEL,
                                    shared_params.DS_TRIE,
                                    shared_params.LM_WEIGHT,
                                    shared_params.WORD_COUNT_WEIGHT,
                                    shared_params.VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        logging.info('Loaded language model in %0.3fs.', lm_load_end)

    def process_audio(self, audio_path):
        """Transcribe the WAV file at ``audio_path``.

        :param audio_path: path handed to ``wav.read``.
        :return: the text from ``self.ds.stt``, or ``""`` if reading or
            inference raised (the error is logged).
        """
        try:
            fs, audio = wav.read(audio_path)
            return self.ds.stt(audio, fs)
        except Exception as ex:
            logging.error(str(ex))
            return ""

    def __del__(self):
        # Drop only the instance-level reference. ``del self.ds`` raised
        # AttributeError whenever __init__ failed before assigning it (or when
        # __del__ ran twice), because only the class default existed then.
        self.__dict__.pop('ds', None)
def __init__(self, model_path, alphabet_path, language_model_path=None, trie_path=None):
    """
    Build the engine, optionally enabling language-model decoding.

    :param model_path: Absolute path to (acoustic) model file.
    :param alphabet_path: Absolute path to file containing alphabet.
    :param language_model_path: Absolute path to language model file. Optional;
        the language model is enabled only when this and ``trie_path`` are set.
    :param trie_path: Absolute path to trie. Optional; see above.
    """
    # Parameter values follow
    # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
    self._model = Model(
        aModelPath=model_path,
        aNCep=26,
        aNContext=9,
        aAlphabetConfigPath=alphabet_path,
        aBeamWidth=500,
    )

    use_lm = not (language_model_path is None or trie_path is None)
    if use_lm:
        self._model.enableDecoderWithLM(
            aAlphabetConfigPath=alphabet_path,
            aLMPath=language_model_path,
            aTriePath=trie_path,
            aLMWeight=1.75,
            aWordCountWeight=1.0,
            aValidWordCountWeight=1.0,
        )
    # Recorded after the decoder call so the flag is only set on success.
    self._with_language_model = use_lm
def main():
    """Transcribe every Flickr8k WAV file into ``flickr_audio_transcription.txt``.

    Each output line is the file's base name and its transcription separated
    by a tab. Files whose sample rate is not 16000 Hz are passed through
    ``convert_samplerate`` first. A ``ValueError`` raised while handling one
    file is reported and that file is skipped.
    """
    model = "models/output_graph.pb"
    alphabet = "models/alphabet.txt"
    lm = "models/lm.binary"
    trie = "models/trie"

    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT, WORD_COUNT_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)

    with open("flickr_audio_transcription.txt", "w") as out:
        for audio_f in glob.glob(
                "/roaming/gchrupal/vgs/data/flickr8k/flickr_audio/wavs/*.wav"):
            print("Transcribing {}".format(audio_f))
            try:
                fs, audio = wav.read(audio_f)
                if fs != 16000:
                    if fs < 16000:
                        print(
                            'Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.'
                            % (fs),
                            file=sys.stderr)
                    # Resample the file currently being processed; this used
                    # to reference ``args.audio``, which does not exist here.
                    fs, audio = convert_samplerate(audio_f)
                basename = os.path.splitext(os.path.basename(audio_f))[0]
                out.write("{}\t{}\n".format(basename, ds.stt(audio, fs)))
                # Flush per file so partial results survive an interruption.
                out.flush()
            except ValueError as e:
                print("Error: {}".format(e))
def recognize(model="../models/output_graph.pb", audio="../audio/2830-3980-0043.wav", alphabet="../models/alphabet.txt", lm="../models/lm.binary", trie="../models/trie"):
    """Transcribe one 16 kHz WAV file and return the recognised text.

    Progress, timings and the result itself are echoed to stderr.

    :param model: path to the model file.
    :param audio: path to the WAV file to transcribe.
    :param alphabet: path to the alphabet file.
    :param lm: path to the language model; used only together with ``trie``.
    :param trie: path to the trie; used only together with ``lm``.
    :return: the text returned by ``ds.stt``.
    :raises AssertionError: if the file's sample rate is not 16000 Hz.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    began = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    took = timer() - began
    print('Loaded model in %0.3fs.' % (took), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        began = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        took = timer() - began
        print('Loaded language model in %0.3fs.' % (took), file=sys.stderr)

    fs, samples = wav.read(audio)
    # Duration assumes 16 kHz, which the assertion below enforces.
    seconds_of_audio = len(samples) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    began = timer()
    result = ds.stt(samples, fs)
    print(result, file=sys.stderr)
    took = timer() - began
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (took, seconds_of_audio), file=sys.stderr)
    return result
def load_model():
    """Load the model (and language model) from the fixed ``./models`` paths.

    Load times are reported on stderr.

    :return: the loaded model with the language-model decoder enabled.
    """
    args = {
        'model': './models/output_graph.pb',
        'alphabet': './models/alphabet.txt',
        'lm': './models/lm.binary',
        'trie': './models/trie',
        'audio': './sample_input.wav'
    }
    model_path, alphabet_path = args['model'], args['alphabet']
    lm_path, trie_path = args['lm'], args['trie']

    print('Loading model from file {}'.format(model_path), file=sys.stderr)
    began = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    elapsed = timer() - began
    print('Loaded model in {:.3}s.'.format(elapsed), file=sys.stderr)

    if not (lm_path and trie_path):
        return ds

    print('Loading language model from files {} {}'.format(lm_path, trie_path),
          file=sys.stderr)
    began = timer()
    ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path,
                           aLMWeight=LM_WEIGHT,
                           aValidWordCountWeight=VALID_WORD_COUNT_WEIGHT,
                           aWordCountWeight=WORD_COUNT_WEIGHT)
    elapsed = timer() - began
    print('Loaded language model in {:.3}s.'.format(elapsed), file=sys.stderr)
    return ds
class DeepSpeech:
    """Holds a loaded model and transcribes WAV files with it."""

    def __init__(self, model, alphabet, lm=None, trie=None):
        """Load the model and, when both files are given, the language model.

        :param model: path to the model file.
        :param alphabet: path to the alphabet file.
        :param lm: optional path to the language model file.
        :param trie: optional path to the trie file.
        """
        print('Loading model from file %s' % (model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if lm is not None and trie is not None:
            print('Loading language model from files %s %s' % (lm, trie),
                  file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end),
                  file=sys.stderr)

    def stt(self, audio_file):
        """Transcribe ``audio_file`` and return the recognised text.

        The result is printed to stdout; timing goes to stderr.

        :param audio_file: path (or file object) accepted by ``wav.read``.
        :return: the text returned by the model's ``stt``.
        """
        fs, audio = wav.read(audio_file)
        # Use the file's own sample rate: nothing here enforces 16 kHz, so a
        # hard-coded 16000 reported the wrong duration for other rates.
        audio_length = len(audio) * (1 / fs)
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        stt_result = self.ds.stt(audio, fs)
        print('Return result: ', stt_result)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
        return stt_result
def __init__(self, modelPath, alphabet, lmPath, trie, numFeatures=26, numContext=9, beamWidth=500):
    """Load the model from ``modelPath`` and report the load time on stderr.

    :param modelPath: path to the model file.
    :param alphabet: path to the alphabet file.
    :param lmPath: accepted but unused here (language-model decoding is
        disabled in this constructor).
    :param trie: accepted but unused here, for the same reason.
    :param numFeatures: forwarded to ``Model`` as its second argument.
    :param numContext: forwarded to ``Model`` as its third argument.
    :param beamWidth: forwarded to ``Model`` as its last argument.
    """
    print('Loading model from file %s' % modelPath, file=sys.stderr)
    started_at = timer()
    self.model = Model(modelPath, numFeatures, numContext, alphabet, beamWidth)
    # NOTE: the enableDecoderWithLM(alphabet, lmPath, trie, ...) call is
    # intentionally left out, so only the bare model is loaded.
    load_seconds = timer() - started_at
    print('Loaded model in %0.3fs.' % (load_seconds), file=sys.stderr)
def __init__(self, vocabulary, graph="models/output_graph.pb", alphabet="models/alphabet.txt"):
    """Create the logger and load the model.

    :param vocabulary: accepted but not used by this constructor.
    :param graph: path to the model graph file.
    :param alphabet: path to the alphabet file.
    """
    logger = logging.getLogger(__name__)
    self._logger = logger
    logger.debug(
        "Initializing DeepSpeech with graph '%s' "
        "and alphabet '%s'",
        graph,
        alphabet,
    )
    # Fixed model parameters: 26, 9 and 500 are passed positionally to Model.
    self._model = Model(graph, 26, 9, alphabet, 500)
class SpeechRecognizer:
    """Transcribes raw 16-bit PCM buffers, resampling to ``SAMPLE_RATE`` first."""

    def __init__(self):
        """Load the model and enable its language-model decoder."""
        self._model = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                            BEAM_WIDTH)
        self._model.enableDecoderWithLM(ALPHABET_PATH, LANGUAGE_MODEL_PATH,
                                        TRIE_PATH, LM_WEIGHT,
                                        WORD_COUNT_WEIGHT,
                                        VALID_WORD_COUNT_WEIGHT)

    def speech_to_text(self, audio_buffer, sample_rate):
        """Return the transcription of ``audio_buffer``.

        :param audio_buffer: bytes-like object interpreted as int16 samples.
        :param sample_rate: sample rate of ``audio_buffer`` in Hz.
        :return: text returned by the model's ``stt``.
        """
        app.logger.info('processing audio file')
        audio = self._process_audio_data(audio_buffer, sample_rate)
        app.logger.info('starting recognition')
        start = time()
        text = self._model.stt(audio, SAMPLE_RATE)
        end = time()
        app.logger.info('finished in {:.3f}s'.format(end - start))
        return text

    def _process_audio_data(self, audio_buffer, original_sample_rate):
        """Decode the buffer as int16 and resample it if its rate differs."""
        audio = np.frombuffer(audio_buffer, dtype=np.int16)
        if original_sample_rate != SAMPLE_RATE:
            audio = self._resample(audio, original_sample_rate)
        return audio

    def _resample(self, audio, original_sample_rate):
        """Resample ``audio`` to ``SAMPLE_RATE`` and return int16 samples."""
        audio_length = len(audio) / original_sample_rate
        samples = int(audio_length * SAMPLE_RATE)
        resampled = signal.resample(audio, samples)
        # The resampler yields floats that can overshoot the int16 range on
        # loud input; clip first so the cast cannot wrap around.
        limits = np.iinfo(np.int16)
        return np.clip(resampled, limits.min, limits.max).astype(np.int16)
def _worker_thread(self):
    """Serve transcription requests from ``self._queue`` until told to stop.

    Queue items are sequences whose first element is a command:

    * ``'sample'`` - the next element is a sample with a ``wav_path``; the
      file is transcribed, timing figures are printed, and
      ``self.inference_done`` is emitted with the sample and the result.
    * ``'stop'`` - leave the loop.
    """
    print('restoring from {}'.format(model_file))
    model = Model(model_file, N_INPUT, N_CONTEXT, ALPHABET_CONFIG_PATH,
                  BEAM_WIDTH)
    model.enableDecoderWithLM(ALPHABET_CONFIG_PATH, LM_BINARY_PATH,
                              LM_TRIE_PATH, LM_WEIGHT, WORD_COUNT_WEIGHT,
                              VALID_WORD_COUNT_WEIGHT)

    while True:
        cmd, *args = self._queue.get()
        if cmd == 'sample':
            sample = args[0]
            # Close the file after reading: this loop is long-running, and
            # the handle used to stay open for every processed sample.
            with wave.open(sample.wav_path) as wav_file:
                audio = np.frombuffer(
                    wav_file.readframes(wav_file.getnframes()),
                    dtype=np.int16)
                fs = wav_file.getframerate()

            start = time.time()
            result = model.stt(audio, fs)
            inference_time = time.time() - start

            wav_time = wav_length(sample.wav_path)
            print('wav length: {}\ninference time: {}\nRTF: {:2f}'.format(
                wav_time, inference_time, inference_time / wav_time))
            self.inference_done.emit(sample, result)
        elif cmd == 'stop':
            break

    # NOTE(review): ``sess`` is not defined in this method; presumably a
    # module-level session object - confirm it exists.
    sess.close()
class transciber(object):
    """Loads a model with its language-model decoder and transcribes WAV files."""

    def __init__(self, modelPath, alphabet, lmPath, trie, numFeatures=26, numContext=9, beamWidth=500):
        """Load the model and enable language-model decoding.

        The time reported on stderr covers both the model load and the
        language-model decoder setup.

        :param modelPath: path to the model file.
        :param alphabet: path to the alphabet file.
        :param lmPath: path to the language model file.
        :param trie: path to the trie file.
        :param numFeatures: forwarded to ``Model`` as its second argument.
        :param numContext: forwarded to ``Model`` as its third argument.
        :param beamWidth: forwarded to ``Model`` as its last argument.
        """
        print('Loading model from file %s' % modelPath, file=sys.stderr)
        model_load_start = timer()
        self.model = Model(modelPath, numFeatures, numContext, alphabet,
                           beamWidth)
        self.model.enableDecoderWithLM(alphabet, lmPath, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    def transcribe(self, audioPath):
        """Transcribe the WAV file at ``audioPath``, print and return the text.

        :param audioPath: path handed to ``wav.read``.
        :return: the text returned by the model's ``stt``.
        """
        fs, audio = wav.read(audioPath)
        label = self.model.stt(audio, fs)
        print(label)
        return label
def main():
    """Command-line entry point: load the model and transcribe one WAV file.

    Positional arguments, in order: ``model``, ``audio``, ``alphabet``,
    ``lm``, ``trie``. All are optional and fall back to their declared
    defaults. Progress and timings go to stderr; the transcription is printed
    to stdout.

    :raises AssertionError: if the audio sample rate is not 16000 Hz.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    # nargs='?' is what makes ``default`` take effect for a positional; without
    # it the argument is required and the default can never be used.
    parser.add_argument('model', type=str, nargs='?',
                        help='Path to the model (protocol buffer binary file)',
                        default="models/output_graph.pb")
    parser.add_argument('audio', type=str, nargs='?',
                        help='Path to the audio file to run (WAV format)',
                        default="sample_input.wav")
    parser.add_argument(
        'alphabet', type=str, nargs='?',
        help='Path to the configuration file specifying the alphabet used by the network',
        default="models/alphabet.txt")
    parser.add_argument('lm', type=str, nargs='?',
                        help='Path to the language model binary file',
                        default="models/lm.binary")
    parser.add_argument(
        'trie', type=str, nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie',
        default="models/trie")
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    fs, audio = wav.read(args.audio)
    # We can assume 16kHz (enforced by the assertion below).
    audio_length = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def __init__(self):
    """Load the model from the module-level paths and enable LM decoding."""
    model = Model(
        MODEL_PATH,
        N_FEATURES,
        N_CONTEXT,
        ALPHABET_PATH,
        BEAM_WIDTH,
    )
    # Store the model before configuring it, as the decoder call may raise.
    self._model = model
    model.enableDecoderWithLM(
        ALPHABET_PATH,
        LANGUAGE_MODEL_PATH,
        TRIE_PATH,
        LM_WEIGHT,
        WORD_COUNT_WEIGHT,
        VALID_WORD_COUNT_WEIGHT,
    )
def main(model, alphabet, lm, trie, dest):
    """Transcribe the numbered WAV clips in ``dest`` and return their texts.

    Clips are read as ``0.wav``, ``1.wav``, ... inside ``dest``, one per entry
    returned by ``os.listdir(dest)``. Clips whose sample rate is not 16000 Hz
    are passed through ``convert_samplerate`` first. Each transcription is
    printed to stdout; progress and timings go to stderr.

    :param model: path to the model file.
    :param alphabet: path to the alphabet file.
    :param lm: path to the language model; used only together with ``trie``.
    :param trie: path to the trie; used only together with ``lm``.
    :param dest: directory holding the clips (with or without a trailing
        path separator).
    :return: list of transcriptions in clip-index order.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files %s %s' % (lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in %0.3fs.' % (lm_load_end),
              file=sys.stderr)

    print('Running inference.', file=sys.stderr)
    subs = []
    # Only the number of directory entries matters; clips are addressed by
    # index, not by the listed names.
    for i, _ in enumerate(os.listdir(dest)):
        # os.path.join works whether or not ``dest`` ends with a separator;
        # plain concatenation produced e.g. "clips0.wav" for dest="clips".
        clip_path = os.path.join(dest, str(i) + '.wav')
        fs, audio = wav.read(clip_path)
        if fs != 16000:
            if fs < 16000:
                print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
            fs, audio = convert_samplerate(clip_path)
        audio_length = len(audio) * (1 / 16000)

        inference_start = timer()
        text = ds.stt(audio, fs)
        subs.append(text)
        print(text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length), file=sys.stderr)
    return subs
def main():
    """Command-line entry point: transcribe one WAV file.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, then ``audio``. Audio that is not sampled at 16000 Hz is
    passed through ``convert_samplerate``. Progress and timings go to stderr;
    the transcription is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    positional_specs = (
        ('model', {'help': 'Path to the model (protocol buffer binary file)'}),
        ('alphabet', {'help': 'Path to the configuration file specifying the alphabet used by the network'}),
        ('lm', {'nargs': '?', 'help': 'Path to the language model binary file'}),
        ('trie', {'nargs': '?', 'help': 'Path to the language model trie file created with native_client/generate_trie'}),
        ('audio', {'help': 'Path to the audio file to run (WAV format)'}),
    )
    for arg_name, options in positional_specs:
        parser.add_argument(arg_name, **options)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    mark = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_secs = timer() - mark
    print('Loaded model in %0.3fs.' % (model_secs), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        mark = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_secs = timer() - mark
        print('Loaded language model in %0.3fs.' % (lm_secs), file=sys.stderr)

    fs, audio = wav.read(args.audio)
    needs_resampling = fs != 16000
    if needs_resampling:
        if fs < 16000:
            print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    audio_secs = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    mark = timer()
    transcript = ds.stt(audio, fs)
    print(transcript)
    infer_secs = timer() - mark
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (infer_secs, audio_secs), file=sys.stderr)
def setup_model(model_path, alphabet, lm, trie):
    """Create a model, enabling language-model decoding when possible.

    :param model_path: path to the model file.
    :param alphabet: path to the alphabet file.
    :param lm: path to the language model; used only together with ``trie``.
    :param trie: path to the trie; used only together with ``lm``.
    :return: the model, or ``None`` when ``model_path`` or ``alphabet`` is
        missing/empty.
    """
    if not (model_path and alphabet):
        return None

    print("creating model {} {}".format(model_path, alphabet))
    ds_model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    wants_lm = lm and trie
    if wants_lm:
        ds_model.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                     WORD_COUNT_WEIGHT,
                                     VALID_WORD_COUNT_WEIGHT)
    return ds_model
def __init__(self, *args, **kwargs):
    """Initialise the STT plugin and load the model named in the profile.

    The ``'deepspeech'`` section of ``self.profile`` must provide ``graph``
    and ``alphabet``. All arguments are forwarded to
    ``plugin.STTPlugin.__init__``.
    """
    plugin.STTPlugin.__init__(self, *args, **kwargs)
    self._logger = logging.getLogger(__name__)

    config = self.profile['deepspeech']
    self._plugin_config = config
    graph, alphabet = config['graph'], config['alphabet']

    self._logger.debug(
        "Initializing DeepSpeech with graph '%s' "
        "and alphabet '%s'",
        graph,
        alphabet,
    )
    # Fixed model parameters: 26, 9 and 500 are passed positionally to Model.
    self._model = Model(graph, 26, 9, alphabet, 500)
def load_model():
    """Load the model and language model from files in the working directory.

    :return: the model with language-model decoding enabled.
    """
    model_path, alphabet_path = 'output_graph.pb', 'alphabet.txt'
    lm_path, trie_path = 'lm.binary', 'trie'

    model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    model.enableDecoderWithLM(
        alphabet_path,
        lm_path,
        trie_path,
        LM_WEIGHT,
        WORD_COUNT_WEIGHT,
        VALID_WORD_COUNT_WEIGHT,
    )
    return model
def setup_model(model_path, alphabet, lm, trie):
    """Create a model, enabling language-model decoding when both files are given.

    Start and completion are reported through ``log``.

    :param model_path: path to the model file.
    :param alphabet: path to the alphabet file.
    :param lm: path to the language model; used only together with ``trie``.
    :param trie: path to the trie; used only together with ``lm``.
    :return: the created model.
    """
    log("creating model {} {}...".format(model_path, alphabet))
    model = Model(model_path, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)

    wants_lm = lm and trie
    if wants_lm:
        model.enableDecoderWithLM(
            alphabet,
            lm,
            trie,
            LM_WEIGHT,
            WORD_COUNT_WEIGHT,
            VALID_WORD_COUNT_WEIGHT,
        )

    log("model is ready.")
    return model
def main():
    """Command-line entry point: transcribe one WAV file.

    Options: ``--model``, ``--alphabet``, ``--lm``, ``--trie``, ``--audio``
    and ``--version``. With ``--version`` the versions are printed and 0 is
    returned. Otherwise the model is loaded, the audio is read (and passed
    through ``convert_samplerate`` when not sampled at 16000 Hz), and the
    transcription is printed to stdout. Progress and timings go to stderr.

    :return: 0 when ``--version`` was requested, otherwise ``None``.
    """
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model',
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet',
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio',
                        help='Path to the audio file to run (WAV format)')
    # nargs='?' with const lets a bare ``--version`` work; it used to require
    # a value. Passing a value (the old form) is still accepted.
    parser.add_argument('--version', nargs='?', const=True,
                        help='Print version and exits')
    args = parser.parse_args()

    if args.version:
        print_versions()
        return 0

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    try:
        fs = fin.getframerate()
        # Duration of the file at its own rate. Dividing by a fixed 16000
        # misreported files that had to be resampled.
        audio_length = fin.getnframes() * (1 / fs)
        if fs != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
            fs, audio = convert_samplerate(args.audio)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    finally:
        # Close the file even if reading or conversion raises.
        fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (inference_end, audio_length), file=sys.stderr)
def load_model(self):
    """Load the model described by this object's attributes.

    Stores the model in ``self.model`` and prints the load time. When both
    ``self.lm`` and ``self.trie`` are set, language-model decoding is enabled
    as well and its load time is printed.
    """
    began = timer()
    self.model = Model(self.graph, self.n_features, self.n_context,
                       self.alphabet, self.beam_width)
    finished = timer()
    print('Loaded model in %0.3fs.' % (finished - began))

    # Nothing more to do unless both language-model files are configured.
    if self.lm is None or self.trie is None:
        return

    began = timer()
    self.model.enableDecoderWithLM(self.alphabet, self.lm, self.trie,
                                   self.lm_weight, self.word_count_weight,
                                   self.valid_word_count_weight)
    finished = timer()
    print('Loaded language model in %0.3fs.' % (finished - began))
def loadModel():
    """Load the model and language model into the module-level ``ds``.

    Paths come from the module constants ``MODEL_FILE``, ``ALPHABET_FILE``,
    ``LM_BINARY_FILE`` and ``TRIE_FILE``. Load times are reported on stderr.
    """
    global ds

    print('Loading model from file %s' % (MODEL_FILE), file=sys.stderr)
    tick = timer()
    ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
    model_seconds = timer() - tick
    print('Loaded model in %0.3fs.' % (model_seconds), file=sys.stderr)

    print('Loading language model from files %s %s' %
          (LM_BINARY_FILE, TRIE_FILE), file=sys.stderr)
    tick = timer()
    ds.enableDecoderWithLM(
        ALPHABET_FILE,
        LM_BINARY_FILE,
        TRIE_FILE,
        LM_WEIGHT,
        WORD_COUNT_WEIGHT,
        VALID_WORD_COUNT_WEIGHT,
    )
    lm_seconds = timer() - tick
    print('Loaded language model in %0.3fs.' % (lm_seconds), file=sys.stderr)
def stt(audioPath):
    """Transcribe the WAV file at ``audioPath`` and return the text.

    The model, alphabet, language model and trie paths are read from ``conf``
    and a new model is created on every call.

    :param audioPath: path handed to ``wav.read``.
    :return: the text returned by the model's ``stt``.
    """
    model, alphabet, lm, trie = (
        conf.get_config(key) for key in ('model', 'alphabet', 'lm', 'trie')
    )

    engine = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    if lm and trie:
        engine.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                   WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)

    sample_rate, samples = wav.read(audioPath)
    return engine.stt(samples, sample_rate)
def load_model(self):
    """Load the model and language model into ``self.ds``.

    Paths come from the module constants ``MODEL_PATH``, ``ALPHABET_PATH``,
    ``LM_PATH`` and ``TRIE_PATH``. Load times are printed to stdout.
    """
    print('Loading model from file %s' % (MODEL_PATH))
    checkpoint = timer()
    self.ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH,
                    BEAM_WIDTH)
    model_seconds = timer() - checkpoint
    print('Loaded model in %0.3fs.' % (model_seconds))

    print('Loading language model from files %s %s' % (LM_PATH, TRIE_PATH))
    checkpoint = timer()
    self.ds.enableDecoderWithLM(
        ALPHABET_PATH,
        LM_PATH,
        TRIE_PATH,
        LM_WEIGHT,
        WORD_COUNT_WEIGHT,
        VALID_WORD_COUNT_WEIGHT,
    )
    lm_seconds = timer() - checkpoint
    print('Loaded language model in %0.3fs.' % (lm_seconds))
class DeepSpeechSTT(plugin.STTPlugin):
    """
    Speech-to-Text plugin backed by DeepSpeech.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the plugin and load the model named in the profile.

        The ``'deepspeech'`` section of ``self.profile`` must provide
        ``graph`` and ``alphabet``. All arguments are forwarded to
        ``plugin.STTPlugin.__init__``.
        """
        plugin.STTPlugin.__init__(self, *args, **kwargs)
        self._logger = logging.getLogger(__name__)

        settings = self.profile['deepspeech']
        self._plugin_config = settings
        graph = settings['graph']
        alphabet = settings['alphabet']

        self._logger.debug(
            "Initializing DeepSpeech with graph '%s' "
            "and alphabet '%s'",
            graph,
            alphabet,
        )
        # Fixed parameters: 26, 9 and 500 are passed positionally to Model.
        self._model = Model(graph, 26, 9, alphabet, 500)

    def transcribe(self, fp):
        """
        Run speech-to-text on an audio file and return the result.

        Arguments:
        fp -- a file object containing audio data
        """
        sample_rate, samples = wav.read(fp)
        return self._model.stt(samples, sample_rate)
def main():
    """Command-line entry point: transcribe every WAV file matching a pattern.

    Positional arguments, in order: ``model``, ``audio`` (a glob pattern),
    ``alphabet``, and optional ``lm`` and ``trie``. For each matching file, in
    sorted order, the transcription is printed and written to a ``.txt`` file
    next to the audio. Files whose ``.txt`` already exists are skipped.

    :raises AssertionError: if a file's sample rate is not 16000 Hz.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    positional_specs = (
        ('model', {'type': str, 'help': 'Path to the model (protocol buffer binary file)'}),
        ('audio', {'type': str, 'help': 'Path to the audio file to run (WAV format)'}),
        ('alphabet', {'type': str, 'help': 'Path to the configuration file specifying the alphabet used by the network'}),
        ('lm', {'type': str, 'nargs': '?', 'help': 'Path to the language model binary file'}),
        ('trie', {'type': str, 'nargs': '?', 'help': 'Path to the language model trie file created with native_client/generate_trie'}),
    )
    for arg_name, options in positional_specs:
        parser.add_argument(arg_name, **options)
    args = parser.parse_args()

    print('Loading model from file %s' % (args.model), file=sys.stderr)
    stamp = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_secs = timer() - stamp
    print('Loaded model in %0.3fs.' % (model_secs), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files %s %s' % (args.lm, args.trie),
              file=sys.stderr)
        stamp = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_secs = timer() - stamp
        print('Loaded language model in %0.3fs.' % (lm_secs), file=sys.stderr)

    for path in sorted(glob.glob(args.audio)):
        stem = os.path.splitext(path)[0]
        target = stem + '.txt'
        # An existing transcript means this file was already processed.
        if os.path.exists(target):
            continue

        fs, audio = wav.read(path)
        # Duration assumes 16 kHz, which the assertion below enforces.
        audio_secs = len(audio) * (1 / 16000)
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

        print('Running inference of %s.' % path, file=sys.stderr)
        stamp = timer()
        text = ds.stt(audio, fs)
        print(text)
        infer_secs = timer() - stamp
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (infer_secs, audio_secs), file=sys.stderr)

        with open(target, 'w') as out:
            out.write(text)
def main():
    """Transcribe the first two minutes of a fixed MP3 recording.

    The recording is converted to 16 kHz, trimmed to 120000 ms, exported as
    ``data/testing.wav`` and then read back for inference. Language-model
    decoding is not enabled. The transcription is printed to stdout; progress
    and timings go to stderr.

    :raises AssertionError: if the exported WAV is not sampled at 16000 Hz.
    """
    print('Loading model from file %s' % MODEL, file=sys.stderr)
    tick = timer()
    ds = Model(MODEL, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    model_secs = timer() - tick
    print('Loaded model in %0.3fs.' % model_secs, file=sys.stderr)

    # NOTE: to decode with a language model, call
    # ds.enableDecoderWithLM(ALPHABET, LANGUAGE_MODEL, TRIE, LM_WEIGHT,
    #                        WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT) here.

    source_path = 'data/sesq316qna.mp3'
    wav_path = 'data/testing.wav'

    # Resample to 16 kHz and keep only the first 2 minutes (2 * 60 * 1000 ms).
    recording = AudioSegment.from_file(source_path).set_frame_rate(16000)
    excerpt = recording[:120000]
    excerpt.export(wav_path, format="wav")

    # Read the exported file back with the WAV reader.
    fs, audio = wav.read(wav_path)
    audio_secs = len(audio) * (1 / 16000)
    assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"

    print('Running inference.', file=sys.stderr)
    tick = timer()
    prediction_text = ds.stt(audio, fs)
    print(prediction_text)
    infer_secs = timer() - tick
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (infer_secs, audio_secs), file=sys.stderr)
def __init__(self, model, alphabet, lm=None, trie=None):
    """Load the model and, when both files are given, the language model.

    Load times are reported on stderr.

    :param model: path to the model file.
    :param alphabet: path to the alphabet file.
    :param lm: optional path to the language model file.
    :param trie: optional path to the trie file.
    """
    print('Loading model from file %s' % (model), file=sys.stderr)
    began = timer()
    self.ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_secs = timer() - began
    print('Loaded model in %0.3fs.' % (model_secs), file=sys.stderr)

    # Language-model decoding needs both files; otherwise we are done.
    if lm is None or trie is None:
        return

    print('Loading language model from files %s %s' % (lm, trie),
          file=sys.stderr)
    began = timer()
    self.ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                                WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    lm_secs = timer() - began
    print('Loaded language model in %0.3fs.' % (lm_secs), file=sys.stderr)
class MozillaDeepSpeechASREngine(ASREngine):
    """https://github.com/mozilla/DeepSpeech"""

    def __init__(self, model_path, alphabet_path, language_model_path=None, trie_path=None):
        """
        Build the engine, optionally enabling language-model decoding.

        :param model_path: Absolute path to (acoustic) model file.
        :param alphabet_path: Absolute path to file containing alphabet.
        :param language_model_path: Absolute path to language model file.
            Optional; the language model is enabled only when this and
            ``trie_path`` are both set.
        :param trie_path: Absolute path to trie. Optional; see above.
        """
        # Parameter values follow
        # https://github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py
        self._model = Model(
            aModelPath=model_path,
            aNCep=26,
            aNContext=9,
            aAlphabetConfigPath=alphabet_path,
            aBeamWidth=500,
        )

        use_lm = not (language_model_path is None or trie_path is None)
        if use_lm:
            self._model.enableDecoderWithLM(
                aAlphabetConfigPath=alphabet_path,
                aLMPath=language_model_path,
                aTriePath=trie_path,
                aLMWeight=1.75,
                aWordCountWeight=1.0,
                aValidWordCountWeight=1.0,
            )
        # Recorded after the decoder call so the flag is only set on success.
        self._with_language_model = use_lm

    def transcribe(self, path):
        """Read the audio at ``path`` and return the model's transcription.

        Samples from ``soundfile.read`` are multiplied by the int16 maximum
        and cast to int16 before being passed to the model.
        """
        samples, sample_rate = soundfile.read(path)
        int16_max = np.iinfo(np.int16).max
        pcm = (int16_max * samples).astype(np.int16)
        return self._model.stt(pcm, aSampleRate=sample_rate)

    def __str__(self):
        """Return the engine name, noting whether a language model is used."""
        if not self._with_language_model:
            return 'Mozilla DeepSpeech'
        return 'Mozilla DeepSpeech (with language model)'
def main():
    """Command-line entry point: transcribe one WAV file.

    Positional arguments, in order: ``model``, ``alphabet``, optional ``lm``
    and ``trie``, then ``audio``. Audio that is not sampled at 16000 Hz is
    passed through ``convert_samplerate``. Progress and timings go to stderr;
    the transcription is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Benchmarking tooling for DeepSpeech native_client.')
    parser.add_argument(
        'model', type=str,
        help='Path to the model (protocol buffer binary file)')
    parser.add_argument(
        'alphabet', type=str,
        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument(
        'lm', type=str, nargs='?',
        help='Path to the language model binary file')
    parser.add_argument(
        'trie', type=str, nargs='?',
        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument(
        'audio', type=str,
        help='Path to the audio file to run (WAV format)')
    opts = parser.parse_args()

    print('Loading model from file %s' % (opts.model), file=sys.stderr)
    t_start = timer()
    ds = Model(opts.model, N_FEATURES, N_CONTEXT, opts.alphabet, BEAM_WIDTH)
    model_elapsed = timer() - t_start
    print('Loaded model in %0.3fs.' % (model_elapsed), file=sys.stderr)

    if opts.lm and opts.trie:
        print('Loading language model from files %s %s' % (opts.lm, opts.trie),
              file=sys.stderr)
        t_start = timer()
        ds.enableDecoderWithLM(opts.alphabet, opts.lm, opts.trie, LM_WEIGHT,
                               WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_elapsed = timer() - t_start
        print('Loaded language model in %0.3fs.' % (lm_elapsed),
              file=sys.stderr)

    fs, audio = wav.read(opts.audio)
    # Warn about up-sampling first, then convert anything that is not 16 kHz.
    if fs < 16000:
        print('Warning: original sample rate (%d) is lower than 16kHz. Up-sampling might produce erratic speech recognition.' % (fs), file=sys.stderr)
    if fs != 16000:
        fs, audio = convert_samplerate(opts.audio)
    clip_seconds = len(audio) * (1 / 16000)

    print('Running inference.', file=sys.stderr)
    t_start = timer()
    text = ds.stt(audio, fs)
    print(text)
    infer_elapsed = timer() - t_start
    print('Inference took %0.3fs for %0.3fs audio file.' %
          (infer_elapsed, clip_seconds), file=sys.stderr)