def main():
    vosk.SetLogLevel(-1)
    audio_path = sys.argv[1]
    out_path = sys.argv[2]
    model_path = 'vosk-model-small-de-0.15'
    sample_rate = 16000

    audio, sr = librosa.load(audio_path, sr=16000)

    # convert to 16bit signed PCM, as expected by VOSK
    int16 = numpy.int16(audio * 32768).tobytes()

    # XXX: Model must be downloaded from https://alphacephei.com/vosk/models
    # https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip
    if not os.path.exists(model_path):
        raise ValueError(f"Could not find VOSK model at {model_path}")

    model = vosk.Model(model_path)
    recognizer = vosk.KaldiRecognizer(model, sample_rate)

    res = transcribe_words(recognizer, int16)
    df = pandas.DataFrame.from_records(res)
    df = df.sort_values('start')

    df.to_csv(out_path, index=False)
    print('Word segments saved to', out_path)
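The snippet above calls a transcribe_words() helper that is defined elsewhere in its project. Below is a minimal sketch of what such a helper could look like, assuming only the documented VOSK result format (a JSON object whose "result" field lists words with "word", "start", "end" and "conf"); the chunk size and dictionary keys are assumptions, not the original code.

import json

def transcribe_words(recognizer, pcm_bytes, chunk_size=4000):
    # Hypothetical helper, not the original project's implementation.
    recognizer.SetWords(True)  # ask VOSK for word-level timestamps
    words = []

    def collect(result_json):
        for token in json.loads(result_json).get('result', []):
            words.append({
                'word': token['word'],
                'start': token['start'],
                'end': token['end'],
                'conf': token['conf'],
            })

    # feed the 16-bit PCM stream to the recognizer in chunks
    for offset in range(0, len(pcm_bytes), chunk_size):
        if recognizer.AcceptWaveform(pcm_bytes[offset:offset + chunk_size]):
            collect(recognizer.Result())
    collect(recognizer.FinalResult())
    return words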
def run(command):
    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        q.put(bytes(indata))

    DEVICE_NUM = None
    MODEL = "model"
    device_info = sd.query_devices(DEVICE_NUM, 'input')
    # SAMPLE_RATE = int(device_info['default_samplerate'])
    SAMPLE_RATE = 16000

    model = vosk.Model(MODEL)
    rec = vosk.KaldiRecognizer(model, SAMPLE_RATE)

    try:
        with sd.RawInputStream(samplerate=SAMPLE_RATE, blocksize=8000,
                               device=DEVICE_NUM, dtype='int16',
                               channels=1, callback=callback):
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    command.value = rec.Result()
    except KeyboardInterrupt:
        print('\nDone')
        exit(0)
    except Exception as e:
        exit(type(e).__name__ + ': ' + str(e))
def recordAudio():
    try:
        device_info = sd.query_devices(None, 'input')
        # soundfile expects an int, sounddevice provides a float:
        samplerate = int(device_info['default_samplerate'])
        model = vosk.Model("model")
        dump_fn = None

        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=None,
                               dtype='int16', channels=1, callback=callback):
            # print('#' * 80)
            # print('Press Ctrl+C to stop the recording')
            # print('#' * 80)
            rec = vosk.KaldiRecognizer(model, samplerate)
            band = True
            while band:
                data = q.get()
                if rec.AcceptWaveform(data):
                    result = rec.Result()
                    band = False
                    value = json.loads(result)
                    print("Recorded text: {0}".format(value["text"]))
                    time.sleep(2)
                    return value["text"]
                else:
                    print(rec.PartialResult())
                # if dump_fn is not None:
                #     dump_fn.write(data)
    except Exception as e:
        return "error"
def recognize_speech(wav_path, lang="en", buffer_size=4000):
    download_model(lang)
    vosk.SetLogLevel(-1)
    wav_file = wave.open(wav_path, "rb")
    recognizer = vosk.KaldiRecognizer(
        vosk.Model("{}/{}".format(get_model_path(), lang)),
        wav_file.getframerate())

    words = []
    for index in tqdm(range(0, wav_file.getnframes(), buffer_size)):
        frames = wav_file.readframes(buffer_size)
        if recognizer.AcceptWaveform(frames):
            result = json.loads(recognizer.Result())
            if len(result["text"]) > 0:
                for token in result["result"]:
                    words.append({
                        "start": token["start"],
                        "end": token["end"],
                        "text": token["word"],
                    })
    return words
def __init__(self, vosk_path='vosk-model-small-en-us-0.15'):
    print('Loading vosk...')
    vosk.SetLogLevel(-1)
    self.VOSK_PATH = vosk_path
    self.vosk_model = vosk.Model(self.VOSK_PATH)
    self.recognizer = vosk.KaldiRecognizer(self.vosk_model, 16000)
    print('Loaded vosk!')
def run():
    try:
        if args.model is None:
            args.model = "model"
        if not os.path.exists(args.model):
            print("Please download a model for your language from https://alphacephei.com/vosk/models")
            print("and unpack as 'model' in the current folder.")
            parser.exit(0)
        if args.samplerate is None:
            device_info = sd.query_devices(args.device, 'input')
            # soundfile expects an int, sounddevice provides a float:
            args.samplerate = int(device_info['default_samplerate'])

        model = vosk.Model(args.model)

        if args.filename:
            dump_fn = open(args.filename, "wb")
        else:
            dump_fn = None

        with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000,
                               device=args.device, dtype='int16',
                               channels=1, callback=callback):
            print('#' * 80)
            print('Press Ctrl+C to stop the recording')
            print('#' * 80)

            rec = vosk.KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    print(rec.Result())
                else:
                    sentence = ast.literal_eval(rec.PartialResult())['partial'].split(' ')
                    print(sentence)
                    if len(sentence) < 6:
                        if any(s in flagged_words for s in sentence):
                            root.configure(background='red')
                        else:
                            root.configure(background='black')
                    else:
                        if any(s in flagged_words for s in sentence[-5:]):
                            root.configure(background='red')
                        else:
                            root.configure(background='black')
                if dump_fn is not None:
                    dump_fn.write(data)
    except KeyboardInterrupt:
        print('\nDone')
        parser.exit(0)
    except Exception as e:
        parser.exit(type(e).__name__ + ': ' + str(e))
def callback_recognize(self, req):
    # clear queue
    q.queue.clear()

    print("options:", len(req.options), req.options)
    print("language:", req.language)
    print("timeout:", str(req.timeout))

    timeout = (req.timeout if (req.timeout != 0) else 20)
    language = (req.language if (req.language != '') else self.language)

    # check if we need to change the language model
    print('current language: ' + self.language)
    if language != self.language:
        print('switching language to ' + language)
        # The VOSK Python API does not raise an exception here,
        # so we need to check the path ourselves.
        if os.path.exists(MODELS_PATH + language):
            self.model = vosk.Model(MODELS_PATH + language)
            self.language = language
        else:
            rospy.loginfo('could not load language model for ' + language)
            return speech_recognizeResponse('')

    with sd.RawInputStream(samplerate=self.device_samplerate, blocksize=8000,
                           device=self.device_index, dtype='int16',
                           channels=1, callback=callback):
        rec = vosk.KaldiRecognizer(self.model, self.device_samplerate)
        t_start = time.time()
        should_stop = False
        transcript = ''
        while not should_stop:
            data = q.get()
            if rec.AcceptWaveform(data):
                result = rec.Result()
                # print(result)
                jres = json.loads(result)
                transcript = jres['text']
                for option in req.options:
                    if option.strip() and option in transcript:
                        transcript = option
                should_stop = True
            else:
                result = rec.PartialResult()
                # print(result)
                jres = json.loads(result)
                for option in req.options:
                    if option.strip() and option in jres['partial']:
                        transcript = option
                should_stop = True if transcript else False
            should_stop = should_stop or ((time.time() - t_start) > timeout)

    return speech_recognizeResponse(transcript)
def processVoskForever(pipe, fs, grammar):
    voice_model = vosk.Model("/home/pi/vosk-model-small-en-us-0.3")
    max_num_iters_without_sound = 3
    buffer_size = max_num_iters_without_sound
    # TODO: Set process prio low.
    while True:
        recognizer = vosk.KaldiRecognizer(voice_model, fs, grammar)
        LOGGER.info("KaldiRecognizer created.")
        chunk_buffer = deque(maxlen=buffer_size)
        got_sound = False
        consecutive_without_sound = 0
        pipe_has_data = True
        bail_early = False
        while pipe_has_data and not bail_early:
            data = pipe.recv()
            if len(data) == 0:
                pipe_has_data = False
                LOGGER.info("Got end of line from audio pipe.")
            else:
                # Insert new chunk into buffer.
                chunk_buffer.append(data)
                float_data = np.frombuffer(data, dtype=np.int16).astype(np.float32)
                rms = np.sqrt(float_data.dot(float_data) / float_data.size)
                LOGGER.info(f"rms: {rms}")
                if rms > 200.0:
                    got_sound = True
                    consecutive_without_sound = 0
                else:
                    consecutive_without_sound += 1
                if got_sound:
                    # If we have got some sound, start popping from the chunk buffer.
                    # This will mean that processing is delayed a few iterations.
                    # It also means that we won't cut any speech from the stream when
                    # we start talking in the middle of a chunk.
                    recognizer.AcceptWaveform(chunk_buffer.popleft())
            # If we have started actually processing due to there being sound in some chunk,
            # and we then have a number of chunks without any sound, consider it done.
            if got_sound and consecutive_without_sound > max_num_iters_without_sound:
                bail_early = True

        # Empty chunk buffer into recognizer.
        LOGGER.info("Emptying buffer.")
        while len(chunk_buffer) > 0:
            recognizer.AcceptWaveform(chunk_buffer.popleft())

        # Empty pipe.
        LOGGER.info("Emptying leftovers in pipe.")
        while pipe_has_data:
            pipe_has_data = len(pipe.recv()) != 0

        result = recognizer.FinalResult()
        LOGGER.info(f"Got final result from recognizer:\n{result}")
        pipe.send(result)
        LOGGER.info("Result was sent on pipe.")
def get_recognizer(model):
    if not Path(model).exists():
        raise Exception(
            "Model {} doesn't exist, maybe download it from: https://alphacephei.com/vosk/models and unzip it here"
            .format(model))
    if not ENABLE_VOSK_DEBUG:
        vosk.SetLogLevel(-1)
    vosk_model = vosk.Model(model)
    rec = vosk.KaldiRecognizer(vosk_model, AUDIO_BITRATE)
    return rec
def vosk_process(self):
    print('Loading vosk...')
    vosk.SetLogLevel(-1)

    int16 = np.int16(self.audioData * 32768).tobytes()

    vosk_path = self.VOSK_PATH
    vosk_model = vosk.Model(vosk_path)
    recognizer = vosk.KaldiRecognizer(vosk_model, 16000)

    print('Transcribing...')
    res = self.transcribe_words(recognizer, int16)
    df = pd.DataFrame.from_records(res)
    df = df.sort_values('start')
    print('Completed transcribe')
    self.df = df
def __init__(self, vosk_model_path, wakeword_detector, nlu_dataset, client, samplerate=16000):
    self.stt = vosk.KaldiRecognizer(vosk.Model(vosk_model_path), samplerate)
    self.client = client
    self.samplerate = samplerate
    self.listener = Listener(samplerate, self.on_noise)
    self.wakeword_detector = wakeword_detector
    self.nlu_engine = SnipsNLUEngine(config=CONFIG_FR)
    self.nlu_engine.fit(json.load(open(nlu_dataset)))
def __init__(self):
    self.q = queue.Queue()
    self.device = None
    try:
        model = "tools/model"  # setting model location
        if not os.path.exists(model):
            print("Please download a model for your language from https://alphacephei.com/vosk/models")
            print("and unpack as 'model' in the tools folder.")
            exit(0)
        device_info = sd.query_devices(self.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        self.samplerate = int(device_info['default_samplerate'])
        model = vosk.Model(model)
        self.rec = vosk.KaldiRecognizer(model, self.samplerate)
    except Exception as e:
        print("EXCEPTION : {}".format(e))
        exit(0)
def main(t: transport.Transport) -> None:
    """Starts speech recognition."""
    import sounddevice, vosk, locale, queue, os.path, json

    # We select the appropriate model from the list of downloaded ones according to the language
    # used in the system. If there is no suitable model, we take the first one that comes across.
    #
    # You can specify a specific model by entering its name below instead of 'searched_folders[0]'.
    lang, _ = locale.getdefaultlocale()
    guess = '-' + lang[:2] + '-'
    searched_folders = [f for f in list_subdirs('models') if guess in f]
    if not searched_folders:
        searched_folders = list_subdirs('models')
    selected_model = searched_folders[0]
    print(f'Selected "{selected_model}".')
    vosk_model = vosk.Model(os.path.join('models', selected_model))

    audio_block_queue = queue.Queue()

    def checkout(indata, frames, time, status):
        """Writes recorded audio to a queue that is handled below."""
        if status:
            print(status, file=sys.stderr)
        audio_block_queue.put(bytes(indata))

    # Usually personal computers and laptops are equipped with at most one microphone, so
    # if there are any microphones at all, we will choose the first one that comes across.
    #
    # If you have more than one microphone, you can specify which one to use by assigning
    # its name to the 'device' kwarg.
    with sounddevice.RawInputStream(blocksize=8000, dtype='int16', channels=1, callback=checkout):
        sample_rate = int(sounddevice.query_devices(sounddevice.default.device, "input")["default_samplerate"])
        vosk_recognizer = vosk.KaldiRecognizer(vosk_model, sample_rate)
        print("Let's start recognizing...")
        try:
            while True:
                if SHUTDOWN:
                    break
                data = audio_block_queue.get()
                if vosk_recognizer.AcceptWaveform(data):
                    text = json.loads(vosk_recognizer.Result())["text"]
                    for word in text.split():
                        if word in ATTENTION_WORDS:
                            print('- ' + text)
                            break
        except KeyboardInterrupt:
            print('\nSpeech recognition is off.')
def __init__(self, prefix, language):
    self.prefix = prefix
    self.language = language

    # find respeaker mic
    self.device_index = self.get_respeaker_device_index()
    if not self.device_index:
        rospy.logfatal("could not find ReSpeaker microphone device")
        raise Exception('device')

    # open mic audio device
    device_info = sd.query_devices(self.device_index, 'input')
    # soundfile expects an int, sounddevice provides a float:
    self.device_samplerate = int(device_info['default_samplerate'])

    self.model = vosk.Model(MODELS_PATH + self.language)

    # start recognize service
    self.speech_recognize = rospy.Service(prefix + '/recognize', speech_recognize,
                                          self.callback_recognize)
def __init__(self, flamingo_tools, model_path="Assistant_Brain/models/sr_model"):
    self.flamingo_tools = flamingo_tools
    self.q = queue.Queue()
    self.model_path = model_path
    self.interpreter = InterpretSpeech()
    self.device = None
    self.rec = None
    self.device_info = sd.query_devices(self.device, 'input')
    # soundfile expects an int, sounddevice provides a float:
    self.sample_rate = int(self.device_info['default_samplerate'])

    # Deactivate sound
    current_path = str(Path.cwd().parent)
    self.deactivate_file = "deactivate.m4a"
    self.deactivate_sound = f"{current_path}/Flamingo/files/audio/assistant_sfx/{self.deactivate_file}"

    # Location of downloaded model from setup_speech_recognition
    self.model = vosk.Model(self.model_path)

    # Time to listen for command (seconds)
    self.listen_time = 10
def __init__(self):
    self._ignore_stderr()

    def audio_callback(in_data, frame_count, time_info, status):
        self.ring_buffer.extend(in_data)
        play_data = chr(0) * len(in_data)
        return play_data, pyaudio.paContinue

    vosk.SetLogLevel(-1)
    sample_rate = 16000
    self.recognizer = vosk.KaldiRecognizer(vosk.Model(VOSK_MODEL), sample_rate)
    self.ring_buffer = RingBuffer()
    self.audio = pyaudio.PyAudio()
    self.stream_in = self.audio.open(input=True, output=False,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=sample_rate,
                                     frames_per_buffer=2048,
                                     stream_callback=audio_callback)
def callback(indata, frames, time, status):
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))

try:
    if not os.path.exists("model"):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)

    device_info = sd.query_devices(None, 'input')
    # soundfile expects an int, sounddevice provides a float:
    samplerate = int(device_info['default_samplerate'])
    model = vosk.Model("model")
    dump_fn = None

    with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=None,
                           dtype='int16', channels=1, callback=callback):
        print('#' * 80)
        print('Press Ctrl+C to stop the recording')
        print('#' * 80)

        rec = vosk.KaldiRecognizer(model, samplerate)
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                value = json.loads(rec.Result())
def run(res):
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-l', '--list-devices', action='store_true',
                        help='show list of audio devices and exit')
    args, remaining = parser.parse_known_args()
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[parser])
    parser.add_argument('-f', '--filename', type=str, metavar='FILENAME',
                        help='audio file to store recording to')
    parser.add_argument('-m', '--model', type=str, metavar='MODEL_PATH',
                        help='Path to the model')
    parser.add_argument('-d', '--device', type=int_or_str,
                        help='input device (numeric ID or substring)')
    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate')
    args = parser.parse_args(remaining)

    if res == 'fr':
        models = 'model_fr'
    elif res == 'en':
        models = 'model_en'
    else:
        print("la langue entrée n'est pas prise en compte par le système")
        models = None

    if models is not None:
        try:
            if args.model is None:
                args.model = models
            if not os.path.exists(args.model):
                print("Please download a model for your language from https://alphacephei.com/vosk/models")
                print("and unpack as {} in the current folder.".format(models))
                parser.exit(0)
            if args.samplerate is None:
                device_info = sd.query_devices(args.device, 'input')
                # soundfile expects an int, sounddevice provides a float:
                args.samplerate = int(device_info['default_samplerate'])

            model = vosk.Model(args.model)

            if args.filename:
                dump_fn = open(args.filename, "wb")
            else:
                dump_fn = None

            with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000,
                                   device=args.device, dtype='int16',
                                   channels=1, callback=callback):
                print("Pour arreter l'enregistrement, Appuyer sur 'Ctrl+c' ")
                rec = vosk.KaldiRecognizer(model, args.samplerate)
                capText = True
                if res == 'fr':
                    print("Je suis à l'écoute ...")
                elif res == 'en':
                    print("I'm listening ...")
                while capText:
                    data = q.get()
                    if rec.AcceptWaveform(data):
                        result = json.loads(rec.Result())
                        if result['text'] != '':
                            capText = False
                            searchText = result['text']
                            print(searchText)

                            def index_phrase_fr():
                                with open('fr.txt', 'r', encoding='utf-8') as f:
                                    tab = f.readlines()
                                for text in tab:
                                    if searchText in text:
                                        read_text = text
                                        index = tab.index(read_text)
                                        engine.setProperty("voice", voices[1].id)
                                        with open('en.txt', 'r', encoding='utf-8') as f:
                                            toto = f.readlines()[index:]
                                        for line in toto:
                                            titi = line
                                            engine.say(titi)
                                            engine.runAndWait()

                            def index_phrase_en():
                                with open('en.txt', 'r', encoding='utf-8') as f:
                                    tab = f.readlines()
                                for text in tab:
                                    if searchText in text:
                                        read_text = text
                                        index = tab.index(read_text)
                                        with open('fr.txt', 'r', encoding='utf-8') as f:
                                            toto = f.readlines()[index:]
                                        for line in toto:
                                            titi = line
                                            engine.say(titi)
                                            engine.runAndWait()

                            if res == 'fr':
                                index_phrase_fr()
                            elif res == 'en':
                                index_phrase_en()
                            # else: capText = True
                    if dump_fn is not None:
                        dump_fn.write(data)
        except KeyboardInterrupt:
            print('\nDone')
            parser.exit(0)
        except Exception as e:
            parser.exit(type(e).__name__ + ': ' + str(e))
    else:
        print('Les langues prises en compte par le système sont : français et anglais')
parser.add_argument(
    '-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)

try:
    if args.model is None:
        args.model = "model"
    if not os.path.exists(args.model):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])

    model = vosk.Model(args.model)

    if args.filename:
        dump_fn = open(args.filename, "wb")
    else:
        dump_fn = None

    with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000,
                           device=args.device, dtype='int16',
                           channels=1, callback=callback):
        print('#' * 80)
        print('Press Ctrl+C to stop the recording')
        print('#' * 80)

        rec = vosk.KaldiRecognizer(model, args.samplerate)
        while True:
            data = q.get()
from pydub import AudioSegment
from pydub.playback import play
import datetime
import json
import traceback
from collections import deque
import editdistance
from time import sleep

import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Build models
current_dir = os.getcwd()
vosk_model = vosk.Model(os.path.join(current_dir, "checkpoints", "vosk-model"))
gpt2_model = GPT2LMHeadModel.from_pretrained("checkpoints/v4/")
tokenizer = GPT2TokenizerFast.from_pretrained("antoiloui/belgpt2",
                                              model_max_length=768,
                                              pad_token='<|pad|>')

# put gpt2 on gpu
device = torch.device('cuda')
gpt2_model.cuda()

# put gpt2 in eval mode
gpt2_model.eval()

# import tacotron stuff
tacotron_dir = "Multilingual_Text_to_Speech"
tacotron_chpt = "generated_switching.pyt"
    q.put(bytes(indata))


if __name__ == "__main__":
    env = get_env()
    try:
        if not os.path.exists(env.model_path):
            logging.error(f'{env.model_path=} not found')
            sys.exit("Acoustic-Language Model was not found")
        if env.sample_rate is None:
            device_info = sd.query_devices(kind='input')
            env.sample_rate = int(device_info['default_samplerate'])

        model = vosk.Model(env.model_path)

        with sd.RawInputStream(samplerate=env.sample_rate, blocksize=4000,
                               dtype='int16', channels=1, callback=callback):
            logging.info('------------------ Press Ctrl+C to stop the recording ------------------')
            recognizer = vosk.KaldiRecognizer(model, env.sample_rate)
            while True:
                data = q.get()
                if recognizer.AcceptWaveform(data):
                    logging.info(recognizer.Result())
tokenData = prepareData(config['tokens'])
patternData = prepareData(config['patterns'])

if config['prefixOther'] is None:
    print('Missing prefixOther property in %s' % configFile)
    exit(1)
if config['prefixMatch'] is None:
    print('Missing prefixMatch property in %s' % configFile)
    exit(1)

if samplerate is None:
    device_info = sd.query_devices(device, 'input')
    # soundfile expects an int, sounddevice provides a float:
    samplerate = int(device_info['default_samplerate'])

model = vosk.Model(config['vosk_model_path'])

with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device,
                       dtype='int16', channels=1, callback=callback):
    rec = vosk.KaldiRecognizer(model, samplerate)
    while True:
        data = q.get()
        if rec.AcceptWaveform(data):
            res = rec.Result()
            obj = json.loads(res)
            text = obj['text']
            if len(text) > 0:
                patterns = tryToParsePatterns(text, tokenData, patternData)
                if patterns is None:
                    print('%s%s' % (config['prefixOther'], text))
                else:
    channels=channels,
    rate=fs,
    frames_per_buffer=chunk,
    stream_callback=audio_queue.addFramesToVector,
    # input_device_index=6,  # 2 = antlion zero, 5 = laptop, 6 = antlion laptop, empty = system default.
    input=True,
    start=False)

valid_commands = [
    "turn on turtle", "turn off turtle",
    "turn on green", "turn off green",
    "turn on blue", "turn off blue",
    "turn on corner", "turn off corner",
    "engage party mode", "let there be light",
    "you all suck", "good night"
]

grammar = getGrammar(valid_commands)
print(grammar)

voice_model = vosk.Model("vosk-model-small-en-us-0.3")
recognizer = vosk.KaldiRecognizer(voice_model, fs, grammar)

current_frame = 0
while True:
    input("Press ENTER to start recording. Ctrl-C to stop recording.")
    audio_queue.clear()
    stream.start_stream()
    print("Recording...")
    try:
        while True:
            time.sleep(0.5)
            # audio_data = stream.read(chunk)
            # audio_data = audio_queue.getNextChunk()
            audio_data = None
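The getGrammar() helper used above is not shown; a minimal sketch follows. It relies on vosk.KaldiRecognizer accepting the grammar as a JSON-encoded list of allowed phrases; the helper name and the extra "[unk]" entry (which lets out-of-grammar speech map to an unknown token) are assumptions about the original project.

import json

def getGrammar(phrases):
    # Hypothetical helper: KaldiRecognizer(model, rate, grammar) expects the
    # grammar as a JSON list of phrases; "[unk]" absorbs anything else.
    return json.dumps(phrases + ["[unk]"])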
def __init__(self, model_path, samplerate=16000, identifier=None):
    super(Vosk, self).__init__(model_path, identifier)
    vosk.SetLogLevel(0)
    self.model = vosk.Model(self.model_path)
    self.rec = vosk.KaldiRecognizer(self.model, samplerate)
    self.samplerate_hz = samplerate