def execute(self, audio, language=None):
    """Run offline Vosk recognition on a captured audio clip.

    A fresh recognizer is created per call from the model held on
    ``self.model`` (expected to match 16 kHz audio) and fed the clip's
    complete WAV payload in one shot.

    :param audio: object exposing ``get_wav_data()`` — presumably a
        ``speech_recognition.AudioData`` instance (TODO confirm with caller).
    :param language: accepted for interface compatibility; not used by Vosk here.
    :return: the recognized transcript string (may be empty).
    """
    recognizer = KaldiRecognizer(self.model, 16000)
    recognizer.AcceptWaveform(audio.get_wav_data())
    decoded = json.loads(recognizer.FinalResult())
    return decoded["text"]
# Vosk subtitle-generation script (fragment, collapsed onto one physical line and
# truncated mid-`while` loop inside transcribe() — the loop body continues outside
# this view, so the code is left byte-identical).
# Flow: decode sys.argv[1] to 16 kHz mono s16le PCM through an ffmpeg pipe, then
# feed it to a word-timestamped KaldiRecognizer; WORDS_PER_LINE presumably caps
# words per subtitle line (TODO confirm in the missing loop body).
# NOTE(review): os/sys/Model/KaldiRecognizer/SetLogLevel are referenced but not
# imported in this fragment — presumably imported earlier in the original file.
import subprocess import srt import json import datetime SetLogLevel(-1) if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) sample_rate = 16000 model = Model("model") rec = KaldiRecognizer(model, sample_rate) rec.SetWords(True) process = subprocess.Popen([ 'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-' ], stdout=subprocess.PIPE) WORDS_PER_LINE = 7 def transcribe(): results = [] subs = [] while True:
import os
import wave

# Abort early when the acoustic model directory has not been downloaded.
if not os.path.exists("model"):
    print(
        "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
    )
    exit(1)

# Vosk expects 16-bit mono PCM, so reject any other WAV layout up front.
wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
# You can also specify the possible word list
rec = KaldiRecognizer(model, wf.getframerate(),
                      "zero oh one two three four five six seven eight nine")

# Stream the file through the recognizer in 4000-frame chunks, printing
# finalized utterances as they complete and partials in between.
while True:
    data = wf.readframes(4000)
    if not data:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

# Flush whatever audio is still buffered in the recognizer.
print(rec.FinalResult())
# Microphone-capture script for a small Russian Vosk model (fragment, collapsed
# onto one physical line and truncated inside get_audio() right after the
# 'start' print — the capture loop continues outside this view, so the code is
# left byte-identical).
# NOTE(review): sample_width = 16000 looks like a copy of the sample *rate*;
# PCM16 sample width is normally 2 bytes — verify against the missing code.
# NOTE(review): input_device_index=6 hard-codes a specific microphone.
#!/usr/bin/env python3 import math import struct import audioop from time import sleep from vosk import Model, KaldiRecognizer import pyaudio model = Model("vosk-model-small-ru-0.4") rec = KaldiRecognizer(model, 16000) Threshold = 400 SHORT_NORMALIZE = (1.0 / 32768.0) sample_width = 16000 def get_audio(): p = pyaudio.PyAudio() stream = p.open(input_device_index=6, format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4000) frames = [] cnt = 0 print('start')
#!/usr/bin/python3
from vosk import Model, KaldiRecognizer
import sys
import json
import os

# Refuse to start without the English model directory in place.
if not os.path.exists("model-en"):
    print("Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model' in the current folder.")
    exit(1)

model = Model("model-en")
# You can also specify the possible word list
rec = KaldiRecognizer(model, 16000,
                      "zero oh one two three four five six seven eight nine")

# The input is read as raw PCM: skip the 44-byte RIFF/WAV header and
# stream the sample bytes straight into the recognizer.
wf = open(sys.argv[1], "rb")
wf.read(44)  # skip header

while True:
    data = wf.read(2000)
    if not data:
        break
    # Print either the finalized utterance or the running partial hypothesis.
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print(res)
    else:
        res = json.loads(rec.PartialResult())
        print(res)
import sys
import os
import wave

SetLogLevel(0)

# Require the model directory before opening anything.
if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

# Only uncompressed 16-bit mono WAV input is supported.
wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())

# Decode in 4000-frame chunks; finalized segments are tagged with the
# Spanish 'RESULTADOS :' prefix, partials are printed as-is.
while True:
    data = wf.readframes(4000)
    if not data:
        break
    if rec.AcceptWaveform(data):
        print('RESULTADOS :', rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
# VoiceController: hybrid wake-word + cloud-command voice controller (collapsed
# onto three physical lines; left byte-identical because the timing logic in
# recognize_stream and the azure callback wiring are too order-sensitive to
# reformat safely).
# Design visible in the code: a local Vosk KaldiRecognizer (__lq_recognizer,
# grammar limited to the wake word) scans the mic stream; once the wake word is
# heard, mode flips to RECORDING_COMMAND and audio is forwarded to an Azure
# SpeechRecognizer (__hq_recognizer) via a PushAudioInputStream. A RingBuffer of
# __buffer_size * __framerate samples (wake_buffer) lets recognize_stream replay
# audio captured while the local recognizer was still deciding — the `lag`
# computation converts wall-clock time minus the wake word's end timestamp and
# missed_frames into a sample count to replay.
# NOTE(review): audio_callback writes the numpy array (audio_data) rather than
# the raw bytes to __hq_stream, and np.fromstring is deprecated — verify both
# against the azure SDK's expected buffer type.
# NOTE(review): WAITING_FOR_WAKEWORD / RECORDING_COMMAND, RingBuffer,
# CommandEntry, speechsdk, np, pyaudio, threading, time and string are defined/
# imported outside this fragment.
class VoiceController: def __init__(self, api_key, wake_word="computer", region="eastus", languages=None): self.__api_key = api_key self.__wake_word = wake_word self.__region = region self.__languages = languages if not self.__languages: self.__languages = ["en-US"] # Are we currently processing a command? self.__active = False # Samples per second to read from mic (single channel) self.__framerate = 44100 self.__config = speechsdk.SpeechConfig(subscription=self.__api_key, region=self.__region) # Write sounds to this stream to send it to azure's speech recognition self.__hq_stream = speechsdk.audio.PushAudioInputStream(stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=self.__framerate)) self.__audio_config = speechsdk.AudioConfig(stream = self.__hq_stream) if len(self.__languages) == 1: self.__hq_recognizer = speechsdk.SpeechRecognizer( speech_config=self.__config, language=self.__languages[0], audio_config = self.__audio_config) else: self.__hq_recognizer = speechsdk.SpeechRecognizer( speech_config=self.__config, auto_detect_source_language_config=speechsdk.languageconfig. 
AutoDetectSourceLanguageConfig(languages=self.__languages), audio_config = self.__audio_config) # Recognizer used to detect the wake word self.__lq_recognizer = KaldiRecognizer(Model("model"), self.__framerate,'["' + self.__wake_word + '"]') # Add callbacks to azure events self.__hq_recognizer.recognized.connect(self.on_recognized) self.__hq_recognizer.session_stopped.connect(self.on_session_stopped) self.__hq_recognizer.canceled.connect(self.on_session_stopped) # Callbacks self.on_ready = lambda *x: x self.on_triggered = lambda *x: x self.on_begin_command = lambda *x: x self.on_finish_command = lambda *x: x self.on_unknown_command = lambda *x: x self.on_error = lambda *x: x # List of commands self.__commands = [] self.__alternatives = {} self.__buffer_size = 5 self.wake_buffer = RingBuffer(self.__buffer_size * self.__framerate) self.recognized = threading.Event() self.mode = WAITING_FOR_WAKEWORD # Future returned by Azure's recognize_once_async, not sure what the point is since you can just # connect callbacks self.fut = None # Number of frames missed by the local recognizer while processing commands self.missed_frames = 0 def on_session_stopped(self, evt): if self.fut: self.fut.get() self.fut = None def add_command(self, pattern, callback): self.__commands.append(CommandEntry(pattern, callback, self.__alternatives)) def add_alternatives(self, word_or_dict, alts=[]): if type(word_or_dict) == dict: self.__alternatives.update(word_or_dict) else: if word_or_dict in self.__alternatives: self.__alternatives[word_or_dict] += alts else: self.__alternatives[word_or_dict] = alts def perform_all_commands(self, cmd): while True: has_match = False for command in self.__commands: result, next_command = command.try_invoke(cmd) if result: has_match = True if next_command: cmd = next_command break else: return if not has_match: break self.on_unknown_command(cmd) def on_recognized(self, event): try: speech = event.result.text.translate(str.maketrans('', '', 
string.punctuation)).lower() print("Recognized: {}".format(speech)) self.perform_all_commands(speech) self.fut.get() self.fut = None self.mode = WAITING_FOR_WAKEWORD except Exception as e: print(e) def audio_callback(self, in_data, frame_count, time_info, status): if status: print(status) audio_data = np.fromstring(in_data, dtype=np.int16) if self.mode == WAITING_FOR_WAKEWORD: self.wake_buffer.extend(audio_data) if self.__lq_recognizer.AcceptWaveform(in_data): self.recognized.set() elif self.mode == RECORDING_COMMAND: self.__hq_stream.write(audio_data) self.missed_frames += frame_count return (None, pyaudio.paContinue) def reset_offline_recognizer(self): self.missed_frames = 0 self.__lq_recognizer = KaldiRecognizer(Model("model"), self.__framerate,'["' + self.__wake_word + '"]') def recognize_stream(self): self.start_time = time.time() while True: self.recognized.wait() self.recognized.clear() result = self.__lq_recognizer.Result() jres = json.loads(result) if not self.__active and jres["text"] == self.__wake_word: self.on_triggered() self.mode = RECORDING_COMMAND wakeword_end_time = 0 for res in jres["result"]: if res["word"] == self.__wake_word: wakeword_end_time = res["end"] lag = time.time() - self.start_time - wakeword_end_time - (self.missed_frames / self.__framerate) lag = int(round((lag) * self.__framerate)) start_data = self.wake_buffer.get(lag) self.fut = self.__hq_recognizer.recognize_once_async() missed = start_data[:lag] missed = np.resize(missed, self.__framerate) self.__hq_stream.write(missed) def start_listening(self): p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.__framerate, input=True, frames_per_buffer=1024, stream_callback=self.audio_callback) stream.start_stream() self.on_ready() self.recognize_stream()
# no_internet(i): offline voice-assistant loop for a Tk GUI (collapsed onto two
# physical lines; left byte-identical — the thread-reuse try/except pattern and
# GUI updates are too order-sensitive to reformat safely).
# Flow visible in the code: open a 16 kHz pyaudio stream, run Vosk on 4000-byte
# chunks, extract the transcript from the raw Result() JSON with a quote-split
# (text() takes list[-2]), then keyword-match the transcript to launch helper
# threads (settings, text editor, terminal, calculator, files), speak the
# time/date, or exit.
# NOTE(review): starting a threading.Thread twice raises RuntimeError — the
# `try: tN.start() except:` blocks rely on that to rebuild the thread object;
# the bare `except:` also swallows any other error.
# NOTE(review): `list` shadows the builtin inside text(); `time` (module) is
# shadowed by the local time string in the 'time' branch after first use.
# NOTE(review): program/root/speak/seeting/texteditor/terminal/files/cal and os
# are defined/imported outside this fragment.
def no_internet(i): #speak('no internet ') from vosk import Model, KaldiRecognizer import threading import datetime import time from datetime import date today = date.today() t2 = threading.Thread(target=seeting) t1 = threading.Thread(target=texteditor) t3 = threading.Thread(target=terminal) t4 = threading.Thread(target=files) t5 = threading.Thread(target=cal) def text(Text): print("hi :", Text[1]) list = Text.split('"') ste = list[-2] print("list", list[-2]) # program.set(ste) # root.update() return ste # x = threading.Thread(target=ext,args=(ste,)) # x.start() #x.join() if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) import pyaudio model = Model("model") rec = KaldiRecognizer(model, 16000) p = pyaudio.PyAudio() print("hi baby", type(p)) program.set("leassening...") root.update() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) stream.start_stream() while i == 1: #x = threading.Thread(target=text(),args=(rec.Result())) data = stream.read(4000) if len(data) == 0: break if rec.AcceptWaveform(data): voice = text(rec.Result()) if 'setting' in voice or 'seating' in voice: voice = voice.replace('open', '') try: speak('opening setting') program.set("opening setting") root.update() t2.start() except: if t2.is_alive(): speak('already opening') else: t2.join() del t2 t2 = threading.Thread(target=seeting) t2.start() elif 'text editor' in voice or 'notepad' in voice or 'edit' in voice: voice = voice.replace('open', '') try: speak('opening text editor') program.set("opening text editor") root.update() t1.start() except: if t1.is_alive(): speak('already opening') else: t1.join() del t1 t1 = threading.Thread(target=texteditor) t1.start() elif 'terminal' in voice or 'cmd' in voice or 'ter' in voice: voice = voice.replace('open', '') try: speak('opening terminal') program.set("opening terminal") root.update() 
t3.start() except: if t3.is_alive(): speak('already opening') else: t3.join() del t3 t3 = threading.Thread(target=terminal) t3.start() elif 'calculator' in voice or 'cal' in voice: voice = voice.replace('open', '') try: speak('opening calculator') program.set("opening calculator") root.update() t5.start() except: if t5.is_alive(): speak('already opening') else: t5.join() del t5 t5 = threading.Thread(target=cal) t5.start() elif 'files' in voice or 'file' in voice: voice = voice.replace('open', '') try: speak('opening file system') program.set("opening files") root.update() t4.start() except: if t4.is_alive(): speak('already opening') else: t4.join() del t4 t4 = threading.Thread(target=files) t4.start() elif 'exit' in voice or 'tata' in voice or 'goodbye' in voice or 'quit' in voice or 'bye bye' in voice: speak("good bye dear have a great day") exit(1) elif 'time' in voice: time = datetime.datetime.now().strftime('%I:%M %p') program.set('Current time is =' + time) root.update() speak('Current time is ' + time) elif 'date' in voice or 'day' in voice or 'debt' in voice: d2 = today.strftime("%B %d, %Y") program.set(d2) root.update() speak(d2) print(d2) else: speak('tell what can i do without internet') i = i + 1 print("Rosult", rec.Result()) #x.start() else: rec.PartialResult()
# Microphone-streaming fragment using sounddevice (collapsed onto one physical
# line and truncated at the start — the `try:` matching the `except` clauses and
# the argparse/queue/model setup are outside this view, so the code is left
# byte-identical).
# Flow visible in the code: optionally open a dump file (args.filename), read
# int16 chunks from a RawInputStream via queue `q`, feed them to a
# KaldiRecognizer, print finalized results, and mirror the raw audio into the
# dump file; Ctrl+C exits cleanly through the KeyboardInterrupt handler.
if args.filename: dump_fn = open(args.filename, "wb") else: dump_fn = None with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, dtype='int16', channels=1, callback=callback): print('#' * 80) print('Press Ctrl+C to stop the recording') print('#' * 80) rec = KaldiRecognizer(model, args.samplerate) while True: data = q.get() if rec.AcceptWaveform(data): print(rec.Result()) #else: #print(rec.PartialResult()) if dump_fn is not None: dump_fn.write(data) except KeyboardInterrupt: print('\nDone') parser.exit(0) except Exception as e: parser.exit(type(e).__name__ + ': ' + str(e))
import pyaudio
from vosk import Model, KaldiRecognizer
import rospy
from std_msgs.msg import String

# Open a 16 kHz mono microphone stream for the recognizer.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

model = Model("model")
rec = KaldiRecognizer(model, 16000)

# Russian voice commands mapped to the English command tokens
# understood downstream (presumably published as ROS String messages —
# the publishing code is outside this fragment).
dictionary = {
    'вперёд': 'forward',
    'назад': 'backwards',
    'налево': 'left',
    'направо': 'right',
    'влево': 'left',
    'вправо': 'right',
    'стоп': 'stop',
    'разверн': 'turn around',
    'иди': 'go',
    'стой': 'stop'
}
# Transcript-export fragment (collapsed onto one physical line and truncated at
# both ends — it begins inside an enclosing function/branch and the
# `elif args.engine == 'vosk'` implies an earlier `if`; left byte-identical).
# Flow visible in the code: write `transcripts` as indented JSON and as a CTM
# file (session id, channel 1, token start/duration/baseform per row); for the
# vosk engine, decode each pre-segmented chunk with a *fresh* word-timestamped
# KaldiRecognizer (del/recreate per segment, presumably to reset decoder state —
# TODO confirm), tagging each partial result with the segment timestamp.
# NOTE(review): VoskModel, sampleRate, sessionId, outJSONFile, outCTMFile,
# segments, useSegmentsInVosk, tqdm and json come from outside this fragment.
print('Writing %d transcripts into file %s' % (len(transcripts), outCTMFile)) with open(outJSONFile, "w") as ofp: ofp.write(json.dumps(transcripts, indent=4)) with open(outCTMFile, 'w') as ofp: for transcript in transcripts: #print('\t%s (%s-%s-%s)\n' % (transcript['transcription'], sessionId, transcript['utterance_start'], transcript['utterance_duration'])) for token in transcript["tokens"]: ofp.write("%s \t 1 \t %.2f \t %.2f \t %s\n" % (sessionId, token["start"], token["duration"], token["baseform"])) print(' ') elif args.engine == 'vosk': rec = KaldiRecognizer(VoskModel, sampleRate) rec.SetWords(True) # get the list of JSON dictionaries results = [] if (useSegmentsInVosk): for segment in tqdm(segments): del rec rec = KaldiRecognizer(VoskModel, sampleRate) rec.SetWords(True) if (len(segment.bytes) == 0): continue if (rec.AcceptWaveform(segment.bytes)): part_result = json.loads(rec.Result()) part_result['uttstart'] = segment.timestamp results.append(part_result) part_result = json.loads(rec.FinalResult())
# ffmpeg-piped transcription script (collapsed onto one physical line and
# truncated mid-`while` loop — the chunk-processing body and the use of
# `totalResult` continue outside this view, so the code is left byte-identical).
# Flow visible in the code: decode sys.argv[1] to 16 kHz mono s16le PCM through
# an ffmpeg subprocess pipe and read it in 4000-byte chunks for a
# KaldiRecognizer; results are presumably accumulated in totalResult
# (TODO confirm in the missing loop body).
# NOTE(review): Model/KaldiRecognizer/SetLogLevel are used but not imported in
# this fragment — presumably imported earlier in the original file.
import os import subprocess import sys import json SetLogLevel(0) if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) sample_rate = 16000 model = Model("model") rec = KaldiRecognizer(model, sample_rate) # process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i', # 'demo2.mp3', # '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'], # stdout=subprocess.PIPE) process = subprocess.Popen([ 'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-' ], stdout=subprocess.PIPE) totalResult = [] while True: data = process.stdout.read(4000) if len(data) == 0:
SetLogLevel(-1)

# Require the model directory before opening anything.
if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

# Only uncompressed 16-bit mono WAV input is supported by the recognizer.
wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())

# Feed the whole file in 4000-frame chunks. Intermediate Result()/
# PartialResult() strings are intentionally discarded — only the final
# transcript is printed at the end.
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    rec.AcceptWaveform(data)

text = rec.FinalResult()
# BUG FIX: the original did `json = (json.loads(text))`, rebinding the
# module name `json` to the parsed dict and shadowing the module for any
# later code. Parse into a distinct variable instead.
result = json.loads(text)
print(result["text"])
# VoskProcessor: async streaming-ASR engine adapter around Vosk (collapsed onto
# five physical lines; left byte-identical — the state machine, model-path
# resolution and result-normalization logic are too order-sensitive to reformat
# safely).
# Structure visible in the code:
# - __init__ resolves model path vs. language (an explicit model path wins and
#   overrides language), optionally loads a speaker model, builds the
#   KaldiRecognizer (with an optional JSON phrase list), and configures
#   alternatives / word timestamps / speaker vectors.
# - process() feeds chunks; AcceptWaveform()==True means silence ended an
#   utterance (final result), otherwise a partial is emitted. _state:
#   0 waiting, 1 partial, 2 final, 3 closing.
# - _finish()/finish_processing() flush the last result; Reset() calls are
#   deliberately commented out to avoid a reported error.
# - normalize_result_format/build_normalized_result flatten Vosk's several
#   output shapes (alternatives array vs. single object; text/partial/final)
#   into one fixed dict; append_to_result merges "intermediate" finals in
#   non-continuous mode (comma-joined text, worst-case confidence).
# NOTE(review): settings, EngineInterface, TextToNumberProcessor,
# DateAndTimeOptimizer, Model, SpkModel, KaldiRecognizer, re, os and json come
# from outside this fragment.
class VoskProcessor(EngineInterface): """Process chunks with Vosk""" def __init__(self, send_message, options: dict = None): """Create Vosk processor""" super().__init__(send_message) # Options if not options: options = {} # Common options - See 'EngineInterface' self._sample_rate = options.get("samplerate", float(16000)) self._language = options.get("language") if self._language: self._language = self._language.replace( "_", "-") # make sure we have xx-XX format self.language_code_short = re.split("[-]", self._language)[0].lower() else: self.language_code_short = None self._asr_model_path = options.get("model", None) self._continuous_mode = options.get("continuous", False) self._optimize_final_result = options.get("optimizeFinalResult", False) # Specific options self._alternatives = options.get("alternatives", int(1)) self._return_words = options.get("words", False) try_speaker_detection = options.get("speaker", False) self._phrase_list = options.get("phrases") # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"] # NOTE: speaker detection does not work in all configurations if try_speaker_detection: self._speaker_detection = (settings.has_speaker_detection_model and self._alternatives == 0) else: self._speaker_detection = False # Recognizer if self._asr_model_path: # Reset language because model has higher priority if self._asr_model_path in settings.asr_model_paths: model_index = settings.asr_model_paths.index( self._asr_model_path) self._language = settings.asr_model_languages[model_index] else: self._language = "" elif not self._language or self._language not in settings.asr_model_languages: self._asr_model_path = settings.asr_model_paths[0] self._language = settings.asr_model_languages[0] else: model_index = settings.asr_model_languages.index(self._language) self._asr_model_path = settings.asr_model_paths[model_index] asr_model_path = settings.asr_models_folder + self._asr_model_path # Speaker model spk_model_path = 
settings.speaker_models_folder + settings.speaker_model_paths[ 0] # Make sure paths exist and load models if self._asr_model_path not in settings.asr_model_paths: raise RuntimeError( "ASR model path is not defined in available paths") if not os.path.exists(asr_model_path): raise RuntimeError("ASR model path seems to be wrong") if self._speaker_detection and not os.path.exists(spk_model_path): raise RuntimeError("Speaker model path seems to be wrong") self._model = Model(asr_model_path) if self._speaker_detection: self._spk_model = SpkModel(spk_model_path) # Use phrase list? if self._phrase_list and len(self._phrase_list) > 0: self._recognizer = KaldiRecognizer( self._model, self._sample_rate, json.dumps(self._phrase_list, ensure_ascii=False)) else: self._recognizer = KaldiRecognizer(self._model, self._sample_rate) self._recognizer.SetMaxAlternatives(self._alternatives) if self._return_words: self._recognizer.SetWords(True) if self._speaker_detection: self._recognizer.SetSpkModel(self._spk_model) self._partial_result = {} self._last_partial_str = "" self._final_result = {} # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing self._state = 0 # # TODO: GPU support: check Vosk examples to find out how to enable GPU ... :-P # Example code: # from vosk import GpuInit, GpuInstantiate # GpuInit() # def thread_init(): # GpuInstantiate() # pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init) async def process(self, chunk: bytes): """Feed audio chunks to recognizer""" result = None if self._state == 3: pass elif self._recognizer.AcceptWaveform(chunk): # Silence detected result = self._recognizer.Result() self._state = 2 await self._handle_final_result(result) else: # Partial results possible result = self._recognizer.PartialResult() self._state = 1 await self._handle_partial_result(result) # End? #if not self.accept_chunks: # await self._finish() async def finish_processing(self): """Wait for last process and end""" # End? 
await self._finish() async def close(self): """Reset recognizer and remove""" #if self._recognizer: #self._recognizer.Reset() # this throws an error!? Maye because its closed already? #self._recognizer = None def get_options(self): """Get Vosk options for active setup""" active_options = { "language": self._language, "model": self._asr_model_path, "samplerate": self._sample_rate, "optimizeFinalResult": self._optimize_final_result, "alternatives": self._alternatives, "continuous": self._continuous_mode, "words": self._return_words, "speaker": self._speaker_detection } if self._phrase_list and len(self._phrase_list) > 0: # NOTE: this can be very large, for now we use a placeholder active_options["phrases"] = [] #active_options["phrases"] = self._phrase_list else: active_options["phrases"] = [] return active_options async def _handle_partial_result(self, result): """Handle a partial result""" if result and self._last_partial_str != result: self._last_partial_str = result norm_result = VoskProcessor.normalize_result_format( result, self._alternatives, self._return_words) self._partial_result = norm_result #print("PARTIAL: ", self._partial_result) await self._send(self._partial_result, False) async def _handle_final_result(self, result, skip_send=False): """Handle a final result""" if result: #print("FINAL: ", result) norm_result = VoskProcessor.normalize_result_format( result, self._alternatives, self._return_words) if self._continuous_mode: # In continous mode we send "intermediate" final results self._final_result = norm_result if not skip_send: await self._send(self._final_result, True) else: # In non-continous mode we remember one big result self._final_result = VoskProcessor.append_to_result( self._final_result, norm_result) #print("FINAL (auto): ", self._final_result) async def _finish(self): """Tell recognizer to stop and handle last result""" last_result_was_final = (self._state == 2) self._state = 3 if last_result_was_final and not self._continuous_mode: # 
Send final result (because we haven't done it yet) await self._send(self._final_result, True) # self._recognizer.Reset() # TODO: we skip this to prevent ERROR if already reset elif last_result_was_final: # We don't need to do anything but reset ... right? # self._recognizer.Reset() # TODO: we skip this to prevent ERROR if already reset pass else: # Request final result = self._recognizer.FinalResult() await self._handle_final_result(result, skip_send=True) await self._send(self._final_result, True) async def _send(self, json_result, is_final=False): """Send result""" features = {} alternatives = [] if self._return_words: features["words"] = json_result.get("words", []) if self._speaker_detection: features["speaker_vector"] = json_result.get("spk", []) if self._alternatives > 0: alternatives = json_result.get("alternatives", []) transcript = json_result.get("text", "") # Post-processing? if is_final and transcript and self._optimize_final_result: # Optimize final transcription text2num_proc = TextToNumberProcessor(self._language) dt_optimizer = DateAndTimeOptimizer(self._language) transcript = text2num_proc.process(transcript) transcript = dt_optimizer.process(transcript) await self.send_transcript(transcript=transcript, is_final=is_final, confidence=json_result.get( "confidence", -1), features=features, alternatives=alternatives) # ---- Helper functions ---- @staticmethod def normalize_result_format(result: str, alternatives=0, has_words=False): """Vosk has many different formats depending on settings Convert result into a fixed format so we can handle it better""" json_result = json.loads(result) words = None if alternatives > 0 and "alternatives" in json_result: json_result = json_result.get("alternatives", []) # handle array alternatives = None if len(json_result) > 1: alternatives = json_result[1:] if has_words: words = json_result[0].get("result") return VoskProcessor.build_normalized_result( json_result[0], alternatives, words) else: # handle object if 
has_words: words = json_result.get("result") return VoskProcessor.build_normalized_result( json_result, None, words) @staticmethod def build_normalized_result(json_result, alternatives=None, words=None): """Build a result object that always looks the same""" # text or partial or empty: text = json_result.get( "text", json_result.get("partial", json_result.get("final", ""))) confidence = json_result.get("confidence", -1) speaker_vec = json_result.get("spk") result = { "text": text, "confidence": confidence, "alternatives": alternatives } if words is not None: result["words"] = words if speaker_vec is not None: result["spk"] = speaker_vec return result @staticmethod def append_to_result(given_result, new_result): """Append a new result to a previous one, typically used for 'intermediate' final result text""" text = new_result.get("text") if not text: return given_result #else: # we can do more post-processing here maybe if "text" in given_result: given_result["text"] += ", " + text if "confidence" in new_result: # sloppy confidence merge (take the worst) given_result["confidence"] = min( given_result.get("confidence", -1), new_result.get("confidence", -1)) if "words" in new_result: # append words given_words = given_result.get("words", []) new_words = new_result.get("words", []) if given_words and len(given_words) and new_words and len( new_words): given_result["words"] = given_words + new_words if "spk" in new_result: # take new speaker data - NOTE: not optimal given_result["spk"] = new_result.get( "spk", given_result.get("spk", [])) return given_result else: new_result["text"] = text return new_result
# Duplicate of VoiceController.__init__ appearing again as a standalone
# fragment (collapsed onto two physical lines, split mid-expression at
# `speechsdk.languageconfig.` / `AutoDetectSourceLanguageConfig`; left
# byte-identical).
# Sets up: Azure SpeechConfig + PushAudioInputStream at 44100 Hz mono
# (single- or auto-detected language), a local Vosk KaldiRecognizer whose
# grammar is restricted to the wake word, azure event callbacks, overridable
# user callbacks (all default to no-ops), the command registry, a 5-second
# RingBuffer of recent mic samples, and the wake-word/command state machine
# fields (recognized Event, mode, fut, missed_frames).
# NOTE(review): speechsdk, KaldiRecognizer, Model, RingBuffer, threading and
# WAITING_FOR_WAKEWORD are defined/imported outside this fragment.
def __init__(self, api_key, wake_word="computer", region="eastus", languages=None): self.__api_key = api_key self.__wake_word = wake_word self.__region = region self.__languages = languages if not self.__languages: self.__languages = ["en-US"] # Are we currently processing a command? self.__active = False # Samples per second to read from mic (single channel) self.__framerate = 44100 self.__config = speechsdk.SpeechConfig(subscription=self.__api_key, region=self.__region) # Write sounds to this stream to send it to azure's speech recognition self.__hq_stream = speechsdk.audio.PushAudioInputStream(stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=self.__framerate)) self.__audio_config = speechsdk.AudioConfig(stream = self.__hq_stream) if len(self.__languages) == 1: self.__hq_recognizer = speechsdk.SpeechRecognizer( speech_config=self.__config, language=self.__languages[0], audio_config = self.__audio_config) else: self.__hq_recognizer = speechsdk.SpeechRecognizer( speech_config=self.__config, auto_detect_source_language_config=speechsdk.languageconfig. 
AutoDetectSourceLanguageConfig(languages=self.__languages), audio_config = self.__audio_config) # Recognizer used to detect the wake word self.__lq_recognizer = KaldiRecognizer(Model("model"), self.__framerate,'["' + self.__wake_word + '"]') # Add callbacks to azure events self.__hq_recognizer.recognized.connect(self.on_recognized) self.__hq_recognizer.session_stopped.connect(self.on_session_stopped) self.__hq_recognizer.canceled.connect(self.on_session_stopped) # Callbacks self.on_ready = lambda *x: x self.on_triggered = lambda *x: x self.on_begin_command = lambda *x: x self.on_finish_command = lambda *x: x self.on_unknown_command = lambda *x: x self.on_error = lambda *x: x # List of commands self.__commands = [] self.__alternatives = {} self.__buffer_size = 5 self.wake_buffer = RingBuffer(self.__buffer_size * self.__framerate) self.recognized = threading.Event() self.mode = WAITING_FOR_WAKEWORD # Future returned by Azure's recognize_once_async, not sure what the point is since you can just # connect callbacks self.fut = None # Number of frames missed by the local recognizer while processing commands self.missed_frames = 0
# Duplicate of VoskProcessor.__init__ appearing again as a standalone fragment
# (collapsed onto two physical lines; left byte-identical).
# NOTE(review): the split mangled a comment — line 1 ends with the comment
# "# Make sure paths exist" and line 2 begins with its continuation
# "and load models" as if it were code; in the original these formed one
# comment line.
# Resolves model path vs. language (explicit model path wins), validates model
# directories, loads the Vosk Model (and SpkModel when speaker detection is
# enabled and alternatives == 0), builds the KaldiRecognizer with an optional
# JSON phrase list, and initializes the partial/final result state machine
# (_state: 0 waiting, 1 partial, 2 final, 3 closing).
# NOTE(review): settings, Model, SpkModel, KaldiRecognizer, re, os and json
# come from outside this fragment.
def __init__(self, send_message, options: dict = None): """Create Vosk processor""" super().__init__(send_message) # Options if not options: options = {} # Common options - See 'EngineInterface' self._sample_rate = options.get("samplerate", float(16000)) self._language = options.get("language") if self._language: self._language = self._language.replace( "_", "-") # make sure we have xx-XX format self.language_code_short = re.split("[-]", self._language)[0].lower() else: self.language_code_short = None self._asr_model_path = options.get("model", None) self._continuous_mode = options.get("continuous", False) self._optimize_final_result = options.get("optimizeFinalResult", False) # Specific options self._alternatives = options.get("alternatives", int(1)) self._return_words = options.get("words", False) try_speaker_detection = options.get("speaker", False) self._phrase_list = options.get("phrases") # example: self._phrase_list = ["hallo", "kannst du mich hören", "[unk]"] # NOTE: speaker detection does not work in all configurations if try_speaker_detection: self._speaker_detection = (settings.has_speaker_detection_model and self._alternatives == 0) else: self._speaker_detection = False # Recognizer if self._asr_model_path: # Reset language because model has higher priority if self._asr_model_path in settings.asr_model_paths: model_index = settings.asr_model_paths.index( self._asr_model_path) self._language = settings.asr_model_languages[model_index] else: self._language = "" elif not self._language or self._language not in settings.asr_model_languages: self._asr_model_path = settings.asr_model_paths[0] self._language = settings.asr_model_languages[0] else: model_index = settings.asr_model_languages.index(self._language) self._asr_model_path = settings.asr_model_paths[model_index] asr_model_path = settings.asr_models_folder + self._asr_model_path # Speaker model spk_model_path = settings.speaker_models_folder + settings.speaker_model_paths[ 0] # Make sure paths exist 
and load models if self._asr_model_path not in settings.asr_model_paths: raise RuntimeError( "ASR model path is not defined in available paths") if not os.path.exists(asr_model_path): raise RuntimeError("ASR model path seems to be wrong") if self._speaker_detection and not os.path.exists(spk_model_path): raise RuntimeError("Speaker model path seems to be wrong") self._model = Model(asr_model_path) if self._speaker_detection: self._spk_model = SpkModel(spk_model_path) # Use phrase list? if self._phrase_list and len(self._phrase_list) > 0: self._recognizer = KaldiRecognizer( self._model, self._sample_rate, json.dumps(self._phrase_list, ensure_ascii=False)) else: self._recognizer = KaldiRecognizer(self._model, self._sample_rate) self._recognizer.SetMaxAlternatives(self._alternatives) if self._return_words: self._recognizer.SetWords(True) if self._speaker_detection: self._recognizer.SetSpkModel(self._spk_model) self._partial_result = {} self._last_partial_str = "" self._final_result = {} # states - 0: waiting for input, 1: got partial result, 2: got final result, 3: closing self._state = 0
def reset_offline_recognizer(self):
    """Discard the wake-word recognizer and replace it with a fresh one.

    Also zeroes ``missed_frames`` (the count of mic frames the local
    recognizer skipped while a command was being processed), so lag
    bookkeeping restarts from a clean slate.
    """
    self.missed_frames = 0
    # Grammar restricted to the single wake word, e.g. '["computer"]'.
    wake_grammar = '["' + self.__wake_word + '"]'
    self.__lq_recognizer = KaldiRecognizer(Model("model"),
                                           self.__framerate,
                                           wake_grammar)
# Video transcription fragment (collapsed onto one physical line and truncated
# at the start — `video_name` and the imports are defined outside this view, so
# the code is left byte-identical).
# Flow visible in the code: extract audio from video_name into sys.argv[1] via
# a shell `ffmpeg` invocation, then re-decode sys.argv[1] through a subprocess
# ffmpeg pipe into a 16 kHz KaldiRecognizer; all recognizer output is currently
# discarded (both branches pass).
# SECURITY(review): os.popen builds a shell command by string concatenation
# from video_name and sys.argv[1] — shell-injectable if either is untrusted;
# should use subprocess.run([...]) with an argument list.
# NOTE(review): the bare `output` expression after stream.read() has no effect.
print('Video name is ' + video_name) stream = os.popen('ffmpeg -i ' + video_name + ' ' + sys.argv[1]) output = stream.read() output if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) sample_rate = 16000 model = Model("model") rec = KaldiRecognizer(model, sample_rate) process = subprocess.Popen([ 'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-' ], stdout=subprocess.PIPE) while True: data = process.stdout.read(4000) if len(data) == 0: break if rec.AcceptWaveform(data): pass # print(rec.Result()) else: pass
#!/usr/bin/python3
from vosk import Model, KaldiRecognizer
import sys
import json
import os

# Refuse to start without the model directory in place.
if not os.path.exists("model"):
    print("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder.")
    exit(1)

model = Model("model")

# Large vocabulary free form recognition
rec = KaldiRecognizer(model, 16000)

# You can also specify the possible word list
#rec = KaldiRecognizer(model, 16000, "zero oh one two three four five six seven eight nine")

# The input is treated as raw PCM: skip the 44-byte WAV header and
# stream the sample bytes straight into the recognizer.
wf = open(sys.argv[1], "rb")
wf.read(44)  # skip header

while True:
    data = wf.read(4000)
    if not data:
        break
    # Print only the transcript of each finalized utterance.
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print(res['text'])
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

SetLogLevel(0)

# Only uncompressed 16-bit mono WAV input is supported.
wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model(lang="en-us")
rec = KaldiRecognizer(model, wf.getframerate())

# Decode until the first finalized utterance, then stop. While decoding,
# watch the partial hypothesis for a specific digit sequence to show where
# a recognizer reset could be inserted.
while True:
    data = wf.readframes(4000)
    if not data:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
        break
    else:
        jres = json.loads(rec.PartialResult())
        print(jres)
        if jres['partial'] == "one zero zero zero":
            print("We can reset recognizer here and start over")
def recognize(self, body):
    """Fetch a dialogue audio file over SFTP, run Vosk speech-to-text on it,
    persist the processed result to Postgres, and remove the local copy.

    body -- remote file path; split as '<dir>/<dialogue_id>.<ext>', so only
            one '/' level is expected -- TODO confirm against callers.
    """
    remote_file_path = body
    try:
        # Dialogue id is the remote basename without its extension.
        dialogue_id = (remote_file_path.split('/')[1]).split('.')[0]
        print('Dialogue id is {}'.format(dialogue_id))
        local_file_path = os.path.join(self.sftp_client.download_path,
                                       remote_file_path.split('/')[1])
        print(local_file_path, remote_file_path)
        self.sftp_client.download_file_local(local_file_path, remote_file_path)
    except Exception as e:
        # Any parsing/download failure aborts the whole process.
        print('Exception occured, wrong filename format {}'.format(remote_file_path))
        print('Exception occured {}'.format(e))
        exit(1)
    recognition_result = []
    stt_recognizer = KaldiRecognizer(self.model, self.rate)
    # NOTE(review): KaldiRecognizer() raises on failure rather than returning
    # None, so the else-branch below is likely dead — verify intent.
    if stt_recognizer is not None:
        try:
            wf = wave.open(local_file_path, "rb")
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio file must be WAV format mono PCM.")
                exit(1)
            # Stream the audio in 8000-frame chunks, collecting the
            # word-level 'result' entries of each finished utterance.
            # NOTE(review): PartialResult() normally carries a 'partial' key,
            # not 'result', so the else-branch append may never fire — confirm.
            while True:
                data = wf.readframes(8000)
                if len(data) == 0:
                    break
                if stt_recognizer.AcceptWaveform(data):
                    recognition_chunk = json.loads(stt_recognizer.Result())
                    if 'result' in recognition_chunk.keys():
                        recognition_result.append(recognition_chunk['result'])
                else:
                    recognition_chunk = json.loads(stt_recognizer.PartialResult())
                    if 'result' in recognition_chunk.keys():
                        recognition_result.append(recognition_chunk['result'])
            print("Recognition result {}".format(json.dumps(recognition_result)))
            # Strip single quotes from recognized words — presumably to keep
            # downstream SQL/JSON handling safe; TODO confirm.
            for phrase in recognition_result:
                for word in phrase:
                    word['word'] = word['word'].replace("'", ' ')
            recognition_result = self.process_sttresult(recognition_result)
            # print('Result is {}'.format(json.dumps(recognition_result)))
            psql_client = PostgresClient()
            psql_client.init_app(config=self.config)
            psql_client.update_stt_result(result=json.dumps(recognition_result, ensure_ascii=False),
                                          dialogue_id=dialogue_id)
            print('Deleting local path {}'.format(local_file_path))
            os.remove(local_file_path)
            print('Function finished, result of recognition {}'.format(recognition_result))
        except Exception as e:
            # Recognition failed: if this dialogue has been pending for more
            # than 3 hours, flag it as errored in the database.
            try:
                psql_client = PostgresClient()
                psql_client.init_app(config=self.config)
                cur_time = datetime.utcnow()
                creation_time = psql_client.get_creation_time(dialogue_id=dialogue_id)
                if (cur_time - creation_time).total_seconds() / 3600 > 3.:
                    psql_client.update_error_status(dialogue_id)
                print('Exception occured {}, recognition longs to musch period of time'.format(e))
            except:
                # NOTE(review): bare except silently exits — consider
                # narrowing the exception type and logging the cause.
                exit(1)
    else:
        print('Please, init stt recognizer')
def get_recognizer(self, framerate):
    """Build a Vosk KaldiRecognizer for the given audio sample rate.

    Silences Vosk logging, then loads the acoustic model bundled with the
    plugin under vosk_alternatives/model.
    """
    SetLogLevel(-1)
    model_dir = os.path.join(c.PLUGIN_PATH, "vosk_alternatives", "model")
    return KaldiRecognizer(Model(model_dir), framerate)
# NOTE(review): collapsed voice-assistant snippet — pyttsx3 text-to-speech
# (speak()) plus Vosk speech-to-text over a PyAudio microphone stream.
# It is truncated mid-statement at the trailing "if rec.AcceptWaveform(data):";
# the result-handling body is missing from this view.
# NOTE(review): stream.read(10000) requests more frames than the 8000-frame
# buffer, and the "len(data) == 0" break never fires on a blocking PyAudio
# stream — verify against the original source before reuse.
#Import the core lib from core import SystemInfo #Speech Synthesis engine = pyttsx3.init() def speak(text): engine.say(text) engine.runAndWait() #Speech recognition model = Model("model") rec = KaldiRecognizer(model, 16000) # Opens microphone for listening. p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000) stream.start_stream() while True: data = stream.read(10000) if len(data) == 0: break if rec.AcceptWaveform(data):
class Recognizer():
    """Hot-word voice recognizer.

    Listens on a PyAudio input stream, adapts an energy threshold to ambient
    noise, and runs Vosk speech-to-text.  RATE, FORMAT, CHANNELS, FPB,
    SAMPLE_WIDTH, SHORT_NORMALIZE, TIMEOUT_LENGTH and Energy_speech are
    module-level constants defined outside this view.
    """

    def __init__(self):
        # Ambient-energy threshold; tuned by adjustment_to_noise().
        self.Threshold = 0
        # Activation keyword (Russian "пирс").
        self.hot_word = 'пирс'
        # Set externally to enable the hot-word loop in start().
        self.flag = False
        # vosk
        self.model = Model("speech_model")
        self.rec = KaldiRecognizer(self.model, RATE)
        # pyaudio
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(format=FORMAT,
                                      channels=CHANNELS,
                                      rate=RATE,
                                      input=True,
                                      frames_per_buffer=FPB)
        self.stream.start_stream()

    # rms (rated maximum sinusoidal) noise calculation
    @staticmethod
    def rms(frame):
        # Unpack the raw frame as 16-bit samples, normalize to [-1, 1],
        # and return the root-mean-square scaled by 1000.
        count = len(frame) / SAMPLE_WIDTH
        form = "%dh" % count
        shorts = struct.unpack(form, frame)
        sum_squares = 0.0
        for sample in shorts:
            n = sample * SHORT_NORMALIZE
            sum_squares += n * n
        rms = sqrt(sum_squares / count)
        return rms * 1000

    # Automatically adjusts microphone level to the environment
    def adjustment_to_noise(self, duration=1):
        # Sample ambient audio for `duration` seconds and derive Threshold.
        seconds_per_buffer = FPB / RATE
        end_time = 0
        while True:
            end_time += seconds_per_buffer
            if end_time > duration:
                break
            data = self.stream.read(FPB)
            rms = self.rms(data)
            damping = 0.15**seconds_per_buffer
            target_rms = rms * 1.5
            # NOTE(review): this multiplies all factors together, unlike the
            # usual exponential-moving-average form
            # threshold = threshold*damping + target*(1 - damping);
            # Energy_speech is defined outside this view — verify the formula.
            self.Threshold = Energy_speech * damping * target_rms * (1 - damping)

    def speech_to_text(self):
        """Listen until TIMEOUT_LENGTH of silence and return recognized text
        (empty string if nothing was recognized)."""
        self.adjustment_to_noise()
        task = ''
        now = time.time()
        end = time.time() + TIMEOUT_LENGTH
        while now <= end:
            data = self.stream.read(FPB)
            # checking the ambient volume: speech extends the deadline
            if self.rms(data) >= self.Threshold:
                end = time.time() + TIMEOUT_LENGTH / 1.2
            now = time.time()
            # vosk
            if self.rec.AcceptWaveform(data):
                text = json.loads(self.rec.Result())
                task = text['text']
        return task

    def start(self):
        """Block until the hot word is heard (while self.flag is set),
        play the acknowledgement sound, and return True."""
        while True:
            if self.flag:
                data = self.stream.read(FPB)
                if self.rec.AcceptWaveform(data):
                    text = json.loads(self.rec.Result())
                    task = text['text']
                    if self.hot_word in task:
                        playsound("audio/listen_to_you.mp3")
                        return True
if not os.path.exists(spk_model_path): print( "Please download the speaker model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder." .format(spk_model_path)) exit(1) wf = wave.open(sys.argv[1], "rb") if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype( ) != "NONE": print("Audio file must be WAV format mono PCM.") exit(1) # Large vocabulary free form recognition model = Model(model_path) spk_model = SpkModel(spk_model_path) rec = KaldiRecognizer(model, spk_model, wf.getframerate()) # We compare speakers with cosine distance. We can keep one or several fingerprints for the speaker in a database # to distingusih among users. spk_sig = [ 4.658117, 1.277387, 3.346158, -1.473036, -2.15727, 2.461757, 3.76756, -1.241252, 2.333765, 0.642588, -2.848165, 1.229534, 3.907015, 1.726496, -1.188692, 1.16322, -0.668811, -0.623309, 4.628018, 0.407197, 0.089955, 0.920438, 1.47237, -0.311365, -0.437051, -0.531738, -1.591781, 3.095415, 0.439524, -0.274787, 4.03165, 2.665864, 4.815553, 1.581063, 1.078242, 5.017717, -0.089395, -3.123428, 5.34038, 0.456982, 2.465727, 2.131833, 4.056272, 1.178392, -2.075712, -1.568503, 0.847139, 0.409214, 1.84727, 0.986758, 4.222116, 2.235512, 1.369377, 4.283126, 2.278125, -1.467577, -0.999971, 3.070041, 1.462214, 0.423204, 2.143578, 0.567174, -2.294655, 1.864723, 4.307356, 2.610872, -1.238721, 0.551861, 2.861954, 0.59613, -0.715396, -1.395357, 2.706177, -2.004444, 2.055255, 0.458283, 1.231968,
# Subtitle-generation snippet: decode any media file to 16 kHz mono PCM via
# ffmpeg and transcribe it with Vosk (srt/datetime suggest SRT output).
# NOTE(review): truncated — transcribe() is cut off right after the first
# read; the result-collection and subtitle-building code is missing here.
import subprocess
import srt
import json
import datetime

SetLogLevel(-1)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

sample_rate = 16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)
# NOTE(review): unlike the sibling snippet in this file, rec.SetWords(True)
# is not called here, so word timestamps needed for SRT cues may be absent —
# confirm against the original source.

# ffmpeg decodes the input (argv[1]) to raw s16le mono PCM on stdout.
process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
], stdout=subprocess.PIPE)

# Maximum words per subtitle line.
WORDS_PER_LINE = 7

def transcribe():
    results = []
    subs = []
    while True:
        data = process.stdout.read(4000)
import wave

# Transcribe a mono 16-bit PCM WAV file, emitting word-level timing info.
SetLogLevel(0)

if not os.path.exists("model"):
    print(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )
    exit(1)

wf = wave.open(sys.argv[1], "rb")
wav_is_valid = (wf.getnchannels() == 1 and wf.getsampwidth() == 2
                and wf.getcomptype() == "NONE")
if not wav_is_valid:
    print("Audio file must be WAV format mono PCM.")
    exit(1)

rec = KaldiRecognizer(Model("model"), wf.getframerate())
rec.SetWords(True)  # include per-word timestamps in results

# Stream 4000-frame chunks: full results as utterances complete,
# partial hypotheses in between.
while frames := wf.readframes(4000):
    if rec.AcceptWaveform(frames):
        print(rec.Result())
    else:
        print(rec.PartialResult())

# Flush whatever audio is still buffered in the recognizer.
print(rec.FinalResult())
#!/usr/bin/env python3
"""Live microphone transcription: stream the default input device into a
Vosk recognizer and print results as they arrive."""

from vosk import Model, KaldiRecognizer
import os
import pyaudio

model = Model('model')
rec = KaldiRecognizer(model, 16000)

# 16 kHz mono 16-bit capture, matching the recognizer's sample rate.
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                input=True,
                frames_per_buffer=8000)
stream.start_stream()

# Pull 4000-frame chunks; an empty read ends the loop.
while chunk := stream.read(4000):
    if rec.AcceptWaveform(chunk):
        print(rec.Result())
    else:
        print(rec.PartialResult())

# Emit whatever audio is still buffered in the recognizer.
print(rec.FinalResult())
import json SetLogLevel(0) model = Model("/home/jim/Playing/model") if sys.argv and sys.argv[0]: dir = Path(sys.argv[1]) else: exit() if not dir.is_dir(): exit() for file in dir.glob('*.mp3'): rec = KaldiRecognizer(model, 16000) rec.SetWords(True) output = [] outfile = dir / f"{file.stem}.json" print(f"{file}\n") process = subprocess.Popen([ 'ffmpeg', '-loglevel', 'quiet', '-i', str(file), '-ar', '16000', '-ac', '1', '-f', 's16le', '-' ], stdout=subprocess.PIPE) while True: data = process.stdout.read(4000) if len(data) == 0: break if rec.AcceptWaveform(data):