def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int, default=500,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float, default=0.75,
                        help='Language model weight (lm_alpha)')
    parser.add_argument('--lm_beta', type=float, default=1.85,
                        help='Word insertion bonus (lm_beta)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio)))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio)))
    else:
        print(ds.stt(audio))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def batching_after_silence(
    audio: np.ndarray,
    silence_threshold: int,
    model: Model,
    verbose: bool = False,
    filters: list = None,
) -> List[Any]:
    """
    Infer after natural gaps of silence

    Ref: http://jamesmontgomery.us/blog/Voice_Recognition_Model.html
    """
    results: list = []
    audio = audio.astype("float32")
    y: np.ndarray = librosa.effects.split(audio, top_db=silence_threshold, ref=np.mean)
    clips: list = []
    for i in tqdm(y):
        clip = audio[i[0]:i[1]]
        clip = clip.astype("int16")
        if filters:
            clip = apply_filters(clip, filters)
        clips.append((clip, filters or ["no filter"]))
    for clip, meta in tqdm(clips):
        transcripts = metadata_to_string(model.sttWithMetadata(clip, 1).transcripts[0])
        if transcripts and verbose:
            print(transcripts, " : ", meta)
        results.append(transcripts)
    return results
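# A minimal usage sketch for batching_after_silence, assuming a 16 kHz mono
# 16-bit WAV and the deepspeech 0.7+ Python package; the model and audio file
# names below are illustrative assumptions, not part of the original snippet.
import wave
import numpy as np
from deepspeech import Model

model = Model("deepspeech-0.7.0-models.pbmm")  # hypothetical path
fin = wave.open("recording.wav", "rb")          # assumed to already match the model's rate
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
texts = batching_after_silence(audio, silence_threshold=30, model=model, verbose=True)
print(" ".join(texts))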
def transcribe(audio_path):
    ds = Model(model_path="deepspeech-0.7.0-models.pbmm")
    desired_sample_rate = ds.sampleRate()
    print(desired_sample_rate)
    ds.enableExternalScorer("deepspeech-0.7.0-models.scorer")
    fin = wave.open(audio_path, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print("Converting from {}hz to {}hz".format(fs_orig, desired_sample_rate))
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()
    inference_start = timer()
    transcript = ds.sttWithMetadata(audio, 1).transcripts[0]
    json_result = metadata_json_output(transcript)
    string_result = metadata_to_string(transcript)
    inference_end = timer() - inference_start
    print(json_result)
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return json_result, string_result
def get_deepspeech_result(data: bytes, model: deepspeech.Model) -> Tuple[str, float]:
    audio_array = bytes_to_array(data)
    start_time = time.time()
    response = model.sttWithMetadata(audio_array)
    end_time = time.time()
    response_time = end_time - start_time
    return metadata_to_string(response), response_time
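# `bytes_to_array` is referenced above but not shown. A plausible minimal
# implementation, assuming `data` is raw little-endian 16-bit PCM (the sample
# format DeepSpeech expects); this is a sketch, not the original helper.
import numpy as np

def bytes_to_array(data: bytes) -> np.ndarray:
    # Interpret the raw byte string as a buffer of 16-bit signed samples.
    return np.frombuffer(data, dtype=np.int16)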
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    writeFile = open('speechtotext.csv', 'w')
    writer = csv.writer(writeFile)
    writer.writerow(['inputfile', 'inference'])

    for file in glob.glob("{}*.wav".format(args.audio)):
        fin = wave.open(file, 'rb')
        fs = fin.getframerate()
        if fs != SAMPLE_RATE:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
            # Resample the file currently being processed.
            fs, audio = convert_samplerate(file)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / SAMPLE_RATE)
        fin.close()

        print('Running inference for {}'.format(file), file=sys.stderr)
        inference_start = timer()
        if args.extended:
            print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
        else:
            # print(ds.stt(audio, fs))
            writer.writerow(["{}".format(file), "{}".format(ds.stt(audio, fs))])
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    writeFile.close()
def transcribe(args, filepath="", verbose=0):
    if verbose > 0:
        print('Loading model from file {}'.format(args.model), file=sys.stderr)
        model_load_start = timer()
    ds = Model(args.model, args.beam_width)
    if verbose > 0:
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    desired_sample_rate = ds.sampleRate()

    if args.lm and args.trie:
        if verbose > 0:
            print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
            lm_load_start = timer()
        ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta)
        if verbose > 0:
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(filepath, 'rb')
    fs = fin.getframerate()
    if fs != desired_sample_rate:
        if verbose > 0:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
        fs, audio = convert_samplerate(filepath, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs)
    fin.close()

    if verbose > 0:
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
    audio_metadata = ds.sttWithMetadata(audio)
    if verbose > 0:
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

    dict_result = dict()
    dict_result["sentence"] = "".join(item.character for item in audio_metadata.items)
    dict_result["words"] = words_from_metadata(audio_metadata)
    dict_result["characters"] = audio_metadata
    dict_result["confidence"] = audio_metadata.confidence
    return dict_result
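# `words_from_metadata` is referenced above but not defined in this snippet.
# A plausible sketch against the DeepSpeech 0.6 Metadata API (items with
# `.character` and `.start_time`), mirroring the word-splitting logic of the
# upstream client; treat it as an assumption, not the original helper.
def words_from_metadata(metadata):
    word = ""
    word_list = []
    word_start_time = 0
    for i, item in enumerate(metadata.items):
        # Accumulate characters until a space or the final item closes a word.
        if item.character != " ":
            if len(word) == 0:
                word_start_time = item.start_time  # first character of a new word
            word += item.character
        if item.character == " " or i == len(metadata.items) - 1:
            if word:
                word_list.append({
                    "word": word,
                    "start_time": round(word_start_time, 4),
                    "duration": round(max(item.start_time - word_start_time, 0), 4),
                })
            word = ""
            word_start_time = 0
    return word_list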
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def main():
    audio_files = glob.glob("uploads/*.wav")
    speech_model = "deepspeech-0.9.3-models.pbmm"
    speech_scorer = "deepspeech-0.9.3-models.scorer"
    speech_audio = audio_files[0]

    print('Loading model from file', file=sys.stderr)
    model_load_start = timer()
    ds = Model(speech_model)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    desired_sample_rate = ds.sampleRate()

    print('Loading scorer from files', file=sys.stderr)
    scorer_load_start = timer()
    ds.enableExternalScorer(speech_scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    fin = wave.open(speech_audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(speech_audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    audio_transcription = metadata_json_output(ds.sttWithMetadata(audio, 3))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    print('Candidate Transcripts:', 3)
    return audio_transcription
def stt(model_path, audio, beam_width=None, scorer_path=None,
        lm_alpha=None, lm_beta=None, hot_words=None):
    ds = Model(model_path)
    if beam_width:
        ds.setBeamWidth(beam_width)
    desired_sample_rate = ds.sampleRate()

    if scorer_path:
        ds.enableExternalScorer(scorer_path)
        if lm_alpha and lm_beta:
            ds.setScorerAlphaBeta(lm_alpha, lm_beta)

    # TODO
    # if hot_words:
    #     print('Adding hot-words', file=sys.stderr)
    #     for w in hot_words:
    #         ds.addHotWord(w, 6.2)

    fin = wave.open(audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(f'ERROR: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz.', file=sys.stderr)
        exit(1)
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    print('Running inference.', file=sys.stderr)
    res = ds.sttWithMetadata(audio, 1)
    res = postprocess_metadata(res)
    return res
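# The hot-words TODO above could plausibly be completed with the hot-word API
# that exists in deepspeech >= 0.9 (Model.addHotWord). The fixed 6.2 boost is
# kept from the original comment; supporting a word -> boost dict is an
# assumption of this sketch.
def enable_hot_words(ds, hot_words):
    # hot_words: iterable of words, or a dict mapping word -> boost.
    if isinstance(hot_words, dict):
        for word, boost in hot_words.items():
            ds.addHotWord(word, float(boost))
    else:
        for w in hot_words:
            ds.addHotWord(w, 6.2)  # fixed boost, as in the original TODO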
def MozillaSTT(audio_path):
    # TODO: handle different rates (not implemented)
    fin = wave.open(audio_path, 'rb')
    output = ""
    # print("SS")
    ds = Model(model_file_path)
    # print("SS")
    ds.enableExternalScorer(scorer_file_path)
    # print("SS")
    lm_alpha = 0.75  # ??
    lm_beta = 1.85
    desired_sample_rate = ds.sampleRate()
    ds.setScorerAlphaBeta(lm_alpha, lm_beta)
    fs_orig = fin.getframerate()
    # print("Desired Sampling Rate: %d", desired_sample_rate)
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. '
              'Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate),
              file=sys.stderr)
        fs_new, audio = convert_samplerate(audio_path, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    # audio_length = fin.getnframes() * (1/fs_orig)
    fin.close()
    print('Running inference.', file=sys.stderr)
    # print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    # print(metadata_json_output(ds.sttWithMetadata(audio, 3)))
    # print(ds.stt(audio))
    output += ds.stt(audio)
    output += '\n'
    output += metadata_json_output(ds.sttWithMetadata(audio, 3))
    return output
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 6 15:41:29 2021

@author: Marshall.McDougall
"""
#import argparse
import numpy as np
#import shlex
#import subprocess
#import sys
import wave
#import json
from deepspeech import Model #, version
#from timeit import default_timer as timer
#try:
#    from shhlex import quote
#except ImportError:
#    from pipes import quote

# deepspeech --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer --audio audio/SimpleTest3.wav --json

ds = Model("deepspeech-0.9.3-models.pbmm")
desired_sample_rate = ds.sampleRate()
ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

# sttWithMetadata expects a 16-bit PCM sample buffer and an integer number of
# candidate transcripts, not a file path and a string.
fin = wave.open("audio/SimpleTest3.wav", 'rb')
audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()
metadata = ds.sttWithMetadata(audio, 3)
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    parser.add_argument('--hot_words', type=str,
                        help='Hot-words and their boosts.')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    if args.hot_words:
        print('Adding hot-words', file=sys.stderr)
        for word_boost in args.hot_words.split(','):
            word, boost = word_boost.split(':')
            ds.addHotWord(word, float(boost))

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print(ds.stt(audio))
    # `sentencefit` is not part of the stock DeepSpeech Python API; it appears
    # to come from a fork of the client that adds forced alignment.
    test = ds.createStream().sentencefit(audio, "ka arohia katoatia te hāhi me ōna whakapono e te hapū o ōtākou")
    for t in test.tokens:
        print(f"letter: {t.letter}, confidence: {t.confidence}, timestep: {t.timestep}")
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
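# Usage sketch for the --hot_words flag above, matching the word:boost pairs
# the parsing loop expects; the file names are illustrative.
#
#   python client.py --model deepspeech-0.9.3-models.pbmm \
#       --scorer deepspeech-0.9.3-models.scorer \
#       --audio audio/test.wav --hot_words "activate:10.0,lights:5.5"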
class DeepSpeech():
    def __init__(self, model_path, scorer_path, result_json_path,
                 result_txt_path, candidate_transcripts=3, beam_width=None):
        # Path to the Speech-To-Text model
        self.MODEL_PATH = model_path
        # Path to the scorer language model
        self.SCORER_PATH = scorer_path
        # The number of candidate transcripts to produce
        self.CANDIDATE_TRANSCRIPTS = candidate_transcripts
        self.result_json_path = result_json_path
        self.result_txt_path = result_txt_path
        self.beam_width = beam_width
        self._setup()

    def _setup(self):
        self.ds = Model(self.MODEL_PATH)  # Declare the model obj
        # Set desired sample rate for STT model.
        self.sample_rate = '16000'
        if self.beam_width:
            self.ds.setBeamWidth(self.beam_width)
        if self.SCORER_PATH:
            self.ds.enableExternalScorer(self.SCORER_PATH)

    def convert_samplerate(self, audio_path, desired_sample_rate):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} ' \
                  '--encoding signed-integer --endian little ' \
                  '--compression 0.0 --no-dither - ' \
                  .format(quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(
                shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                'SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno,
                          'SoX not found, use {}hz files or install it: {}'
                          .format(desired_sample_rate, e.strerror))
        return desired_sample_rate, np.frombuffer(output, np.int16)

    def words_from_candidate_transcript(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i, token in enumerate(metadata.tokens):
            # Append character to word if it's not a space
            if token.text != " ":
                if len(word) == 0:
                    # Log the start time of the new word
                    word_start_time = token.start_time
                word = word + token.text
            # Word boundary is either a space or the last character in the arr
            if token.text == " " or i == len(metadata.tokens) - 1:
                word_duration = token.start_time - word_start_time
                if word_duration < 0:
                    word_duration = 0
                each_word = dict()
                each_word["word"] = word
                each_word["start_time"] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)
                word_list.append(each_word)
                # Reset
                word = ""
                word_start_time = 0
        return word_list

    def metadata_json_output(self, metadata):
        json_result = dict()
        json_result["transcripts"] = [{
            "confidence": transcript.confidence,
            "words": self.words_from_candidate_transcript(transcript),
        } for transcript in metadata.transcripts]
        return json.dumps(json_result, indent=4)

    def take_audio_info(self):
        probe = ffmpeg.probe(self.FILE_PATH)
        self.audio_info = next(
            (stream for stream in probe['streams']
             if stream['codec_type'] == 'audio'), None)
        print(self.audio_info)
        return self.audio_info

    def take_audio(self):
        out, err = (
            ffmpeg
            .input(self.FILE_PATH)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1,
                    ar=self.sample_rate)
            .run(capture_stdout=True, capture_stderr=True)
        )
        self.audio = np.frombuffer(out, np.int16)
        return self.audio

    def speech2text(self):
        metadata = self.ds.sttWithMetadata(
            self.audio, self.CANDIDATE_TRANSCRIPTS)
        json_result = self.metadata_json_output(metadata)
        with open(self.result_json_path, 'w') as outfile:
            outfile.write(json_result)
        dict_result = json.loads(json_result)
        word_list = [item["word"]
                     for item in dict_result["transcripts"][0]["words"]]
        sentence = " ".join(word_list)
        self.export2textfile(sentence)
        return sentence

    def export2textfile(self, sentence):
        txt_file = open(self.result_txt_path, "w")
        txt_file.writelines(sentence)
        txt_file.close()

    def set_file(self, filepath):
        self.FILE_PATH = filepath
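# A minimal usage sketch for the DeepSpeech class above; the model, scorer and
# media file names are illustrative assumptions.
engine = DeepSpeech(
    model_path="deepspeech-0.9.3-models.pbmm",
    scorer_path="deepspeech-0.9.3-models.scorer",
    result_json_path="result.json",
    result_txt_path="result.txt",
)
engine.set_file("recording.mp4")   # any container ffmpeg can decode
engine.take_audio()                # extract 16 kHz mono PCM via ffmpeg
print(engine.speech2text())        # writes result.json / result.txt, returns the sentence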
def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--scorer', required=False,
                        help='Path to the external scorer file')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--beam_width', type=int,
                        help='Beam width for the CTC decoder')
    parser.add_argument('--lm_alpha', type=float,
                        help='Language model weight (lm_alpha). If not specified, use default from the scorer package.')
    parser.add_argument('--lm_beta', type=float,
                        help='Word insertion bonus (lm_beta). If not specified, use default from the scorer package.')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    parser.add_argument('--json', required=False, action='store_true',
                        help='Output json from metadata with timestamp of each word')
    parser.add_argument('--candidate_transcripts', type=int, default=3,
                        help='Number of candidate transcripts to include in JSON output')
    args = parser.parse_args()

    # print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(args.model)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    # print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.beam_width:
        ds.setBeamWidth(args.beam_width)

    desired_sample_rate = ds.sampleRate()

    if args.scorer:
        # print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr)
        scorer_load_start = timer()
        ds.enableExternalScorer(args.scorer)
        scorer_load_end = timer() - scorer_load_start
        # print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

        if args.lm_alpha and args.lm_beta:
            ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

    fin = wave.open(args.audio, 'rb')
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        # print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
        fs_new, audio = convert_samplerate(args.audio, desired_sample_rate)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    # print('Running inference.', file=sys.stderr)
    inference_start = timer()
    # sphinx-doc: python_ref_inference_start
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
    elif args.json:
        print(metadata_json_output(ds.sttWithMetadata(audio, args.candidate_transcripts)))
    else:
        print("Translation: " + ds.stt(audio))
    # sphinx-doc: python_ref_inference_stop
    inference_end = timer() - inference_start
def main():
    # parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    # parser.add_argument('--model', required=True,
    #                     help='Path to the model (protocol buffer binary file)')
    # parser.add_argument('--alphabet', required=True,
    #                     help='Path to the configuration file specifying the alphabet used by the network')
    # parser.add_argument('--lm', nargs='?',
    #                     help='Path to the language model binary file')
    # parser.add_argument('--trie', nargs='?',
    #                     help='Path to the language model trie file created with native_client/generate_trie')
    # parser.add_argument('--audio', required=True,
    #                     help='Path to the audio file to run (WAV format)')
    # parser.add_argument('--version', action=VersionAction,
    #                     help='Print version and exits')
    # parser.add_argument('--extended', required=False, action='store_true',
    #                     help='Output string from extended metadata')
    # args = parser.parse_args()
    args = {
        'alphabet': 'models/alphabet.txt',
        'audio': 'input/test.wav',
        'extended': False,
        'lm': 'models/lm.binary',
        'model': 'models/output_graph.pbmm',
        'trie': 'models/trie',
        'version': None
    }
    # print("-----------------------------", args['model'])
    # for key, value in args.items():
    #     print(key, value)

    print('Loading model from file {}'.format(args['model']), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args['lm'] and args['trie']:
        print('Loading language model from files {} {}'.format(args['lm'], args['trie']), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'], LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args['audio'], 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args['audio'])
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args['extended']:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def speakSpell(audioFile):
    TEXT = 'something went wrong'

    def convert_samplerate(audio_path):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate 16000 --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path))
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(e.errno, 'SoX not found, use 16kHz files or install it: {}'.format(e.strerror))
        return 16000, np.frombuffer(output, np.int16)

    def metadata_to_string(metadata):
        return ''.join(item.character for item in metadata.items)

    # Load DeepSpeech model
    # if __name__ == '__main__':
    BEAM_WIDTH = 500   # Beam width used in the CTC decoder when building candidate transcriptions. Default: 500
    LM_ALPHA = 0.75    # The alpha hyperparameter of the CTC decoder. Language Model weight. Default: 0.75
    LM_BETA = 1.85     # The beta hyperparameter of the CTC decoder. Word insertion bonus. Default: 1.85
    N_FEATURES = 26    # Number of MFCC features to use. Default: 26
    N_CONTEXT = 9      # Size of the context window used for producing timesteps in the input vector. Default: 9

    # MOD = str(getFile('http://map-courses.usc.edu/codecollective/CCC/IVO/models/output_graph.pbmm', 'output_graph.pbmm'))
    # WILL NEED TO HOST ELSEWHERE WITH HIGHER SPEED/SIZE...CORS ISSUE??
    MOD = 'models/output_graph.pbmm'
    # ALPHABET = str(getFile('http://map-courses.usc.edu/codecollective/CCC/IVO/models/alphabet.txt', 'alphabet.txt'))
    ALPHABET = 'models/alphabet.txt'
    LM = ''    # 'lm.binary'
    TRIE = ''  # 'trie'  # 'models/trie'
    EXTENDED = ''
    VAD = 3  # int 0-3, higher is more aggressive, filters out more non-speech
    SAVEWAV = 'STTaudio'  # folder name for files
    if SAVEWAV:
        os.makedirs(SAVEWAV, exist_ok=True)
    # main()

    '''
    if os.path.isdir(MOD):
        model_dir = MOD
        MOD = os.path.join(model_dir, 'output_graph.pb')
        ALPHABET = os.path.join(model_dir, ALPHABET if ALPHABET else 'alphabet.txt')
        LM = os.path.join(model_dir, LM)
        TRIE = os.path.join(model_dir, TRIE)
    '''

    print('Initializing model...')
    # self.wfile.write(str('initializing model'))
    # global model
    model_load_start = timer()
    model = Model(MOD, N_FEATURES, N_CONTEXT, ALPHABET, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)
    # self.wfile.write(str('loaded model'))

    if LM and TRIE:
        lm_load_start = timer()
        print('Loading language model from files {} {}'.format(LM, TRIE), file=sys.stderr)
        model.enableDecoderWithLM(ALPHABET, LM, TRIE, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    # then do stuff here
    # GET URL HERE WILL BE A CALL FROM CLIENT, A SOCKET MSG W URL
    # FILE_URL = 'http://map-courses.usc.edu/codecollective/CCC/IVO/models/speak_2019-08-09_15-53-06_972543.wav'
    # FILE_NAME = str(os.path.join(SAVEWAV, datetime.now().strftime("speak_%Y-%m-%d_%H-%M-%S_%f.wav")))
    FILE = audioFile  # 'speak_2019-08-09_15-53-06_972543.wav'
    # change to get file: str(getFile(FILE_URL, FILE_NAME))
    fin = wave.open(FILE, 'rb')  # wave.open(FILE_NAME, 'rb')
    fs = fin.getframerate()

    '''
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(audio)  # convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    '''
    # UnboundLocalError: local variable 'audio' referenced before assignment
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    # self.wfile.write(str('running inference'))
    inference_start = timer()
    if EXTENDED:  # if args.extended:
        print(metadata_to_string(model.sttWithMetadata(audio, fs)))
        TEXT = metadata_to_string(model.sttWithMetadata(audio, fs))
    else:
        print(model.stt(audio, fs))
        TEXT = str(model.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    # self.wfile.write(str('finished inference'))

    '''
    global FILE_ID
    FILE_ID = uploadFile(FILE_NAME)
    # added this
    gqlMutateText(FILE_ID, TEXT)
    lyreBird(FILE_ID, TEXT)
    '''
    # THEN ADD SOCKET EMIT TO TELL CLIENT TO PULL THE NEW FILE AND POPULATE THE PAGE
    # return Response("<h1>Flask on Now Zero Config</h1><p>You visited: /%s</p>" % (path), mimetype="text/html")
    # return Response("<h1>Flask on Now Zero Config</h1><p>DeepSpeech heard: %s </p>" % TEXT, mimetype="text/html")
    print(TEXT)
    return TEXT
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    FORMAT = pyaudio.paInt16
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it and sends it to the model to be
        transcribed. Returns the transcribed audio as a string. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in the form of hot-words and boosts, adds them to the
        language model and returns the list of added hot-words """
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get numeric value of the boost
                boost = float(data.get(hot_word))
                # Adding the (lower-cased) hot-word and its boost to the language model
                self.model.addHotWord(word, boost)
                # Printing on the prompt the activity
                logger.info(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def deep_stream(self):
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """ Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration. """
        # audio = np.frombuffer(audio, np.int16)
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
class SpeechToTextEngine:
    """ Class to perform speech-to-text transcription and related functionality """

    def __init__(self, scorer='deepspeech_model.scorer') -> None:
        """ Initializing the DeepSpeech model """
        self.model = Model(model_path=Path(__file__).parents[2].joinpath(
            'deepspeech_model.pbmm').absolute().as_posix())
        self.model.enableExternalScorer(scorer_path=Path(
            __file__).parents[2].joinpath(scorer).absolute().as_posix())

    def run(self, audio) -> str:
        """ Receives the audio, normalizes it and sends it to the model to be
        transcribed. Returns the transcribed audio as a string. """
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.stt(audio_buffer=audio_streams)
        return results

    def run_with_metadata(self, audio) -> Metadata:
        normalized_audio = normalize_audio_input(audio)
        audio_streams = BytesIO(normalized_audio)
        with wave.Wave_read(audio_streams) as wav:
            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
        return results

    def add_hot_words(self, data) -> list:
        """ Receives data in the form of hot-words and boosts, adds them to the
        language model and returns the list of added hot-words """
        all_hot_words = []
        try:
            logger.info('----------------------------------------------------')
            for hot_word in data:
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()
                # Get numeric value of the boost
                boost = float(data.get(hot_word))
                # Adding the (lower-cased) hot-word and its boost to the language model
                self.model.addHotWord(word, boost)
                # Printing on the prompt the activity
                logger.info(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                logger.info(f"`{hot_word}` hot-word is erased.")
            logger.info('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def sample_rate(self):
        return self.model.sampleRate()
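# Usage sketch for the hot-word API above, assuming the model and scorer files
# exist at the paths the constructor expects; the words and boosts are
# illustrative.
engine = SpeechToTextEngine()
added = engine.add_hot_words({"DeepSpeech": 7.5, "Mozilla": 5.0})  # dict of word -> boost
engine.erase_hot_word(added)
print(engine.clear_hot_words())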
fin = wave.open(file_path, 'rb')
fs_orig = fin.getframerate()
if fs_orig != desired_sample_rate:
    print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
    fs_new, audio = convert_samplerate(file_path, desired_sample_rate)
else:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
fin.close()

inference = ds.sttWithMetadata(audio, 1).transcripts[0]
transcript.append(''.join(token.text for token in inference.tokens))
print("final transcript:\n\n{}\n\n".format(transcript))
client.publish(TEST_OUTPUT_TOPIC, json.dumps(transcript))
print("I got:\n{}".format(transcript))

# This is just a simplified mockup of the deepspeech demo code,
# I stole the wavSplit file also from the demo code
if __name__ == "__main__" and False:
    # locate model files
    model_path = os.path.join(os.getcwd(), "models")
    print("model_path: {}".format(model_path))
    pb = glob.glob(model_path + "/*.pbmm")[0]
def main_transcript(video_to_encode):
    msg = ""
    mp3file = video_to_encode.get_video_mp3(
    ).source_file if video_to_encode.get_video_mp3() else None
    lang = video_to_encode.main_lang
    # check if DS_PARAM[lang] exists
    if not DS_PARAM.get(lang):
        msg += "\n no deepspeech model found for lang:%s." % lang
        msg += "Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])
    if all([cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()
    webvtt = WebVTT()
    inference_start = timer()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):
        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)
        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)
        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, duration)

        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'
        metadata = ds_model.sttWithMetadata(audio)
        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list
        refItem = metadata.items[0]
        index = get_index(metadata, last_item, start_trim) if last_item else 0
        # nb of character in AUDIO_SPLIT_TIME
        msg += "METADATA ITEMS : %d " % len(metadata.items)
        sentences = get_sentences(metadata, refItem, index)
        last_item = (
            sentences[-1][-1].character,
            sentences[-1][-1].start_time) if len(sentences) > 0 else ()
        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
                caption = Caption(
                    '%s.%s' % (timedelta(seconds=int(str(start_time).split('.')[0])),
                               str('%.3f' % start_time).split('.')[1]),
                    '%s.%s' % (timedelta(seconds=int(str(end_time).split('.')[0])),
                               str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])
                webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg
class DeepSpeechWrapper:
    def __init__(self, dir):
        parser = argparse.ArgumentParser(
            description='Running DeepSpeech inference.')
        parser.add_argument(
            '--model',
            default=os.path.join(dir, 'output_graph.pbmm'),
            help='Path to the model (protocol buffer binary file)')
        parser.add_argument(
            '--alphabet',
            default=os.path.join(dir, 'alphabet.txt'),
            help='Path to the configuration file specifying the alphabet used by the network')
        parser.add_argument('--lm', nargs='?',
                            default=os.path.join(dir, 'lm.binary'),
                            help='Path to the language model binary file')
        parser.add_argument(
            '--trie', nargs='?',
            default=os.path.join(dir, 'trie'),
            help='Path to the language model trie file created with native_client/generate_trie')
        parser.add_argument('--version', action=VersionAction,
                            help='Print version and exits')
        parser.add_argument('--extended', required=False, action='store_true',
                            help='Output string from extended metadata')
        self.args = parser.parse_args('')  # shadow the system args

        self.ds = Model(self.args.model, N_FEATURES, N_CONTEXT,
                        self.args.alphabet, BEAM_WIDTH)
        self.audio = None
        self.audio_length = 0
        self.fs = 16000
        if self.args.lm and self.args.trie:
            # print('Loading language model from files {} {}'.format(self.args.lm, self.args.trie), file=sys.stderr)
            # lm_load_start = timer()
            self.ds.enableDecoderWithLM(self.args.alphabet, self.args.lm,
                                        self.args.trie, LM_ALPHA, LM_BETA)
            # lm_load_end = timer() - lm_load_start
            # print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    def set_input(self, filename):
        fin = wave.open(filename, 'rb')
        self.fs = fin.getframerate()
        if self.fs != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(self.fs), file=sys.stderr)
            self.fs, audio = convert_samplerate(filename)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        self.audio = audio
        self.audio_length = fin.getnframes() / self.fs
        fin.close()

    def recognize_audio(self, start_time, end_time):
        start_frame = int(start_time * self.fs)
        end_frame = int(end_time * self.fs)
        seq = self.audio[start_frame:end_frame]
        if self.args.extended:
            return metadata_to_string(self.ds.sttWithMetadata(seq, self.fs))
        else:
            return self.ds.stt(seq, self.fs)
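# Usage sketch for DeepSpeechWrapper; the model directory and time window are
# illustrative assumptions.
wrapper = DeepSpeechWrapper('models/')
wrapper.set_input('recording.wav')
print(wrapper.recognize_audio(0.0, 5.0))  # transcribe the first five seconds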