def translate_file(filename="last5.wav"): SetLogLevel(-1) if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) filepath = "./" + filename wf = wave.open(filepath, "rb") if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype( ) != "NONE": print("Audio file must be WAV format mono PCM.") exit(1) model = Model("./model") rec = KaldiRecognizer(model, 16000) while True: data = wf.readframes(4000) if len(data) == 0: break if rec.AcceptWaveform(data): res = rec.FinalResult() #print(rec.FinalResult()) #else: #print(rec.PartialResult()) try: #for some reason res doesnt get assigned post loop results = res #print("results: " +results) except UnboundLocalError: results = rec.FinalResult( ) #rec.FinalResult() holds the words in this case results_json = json.loads(results) #print(results_json["text"]) return (results_json["text"]) #["results"] for confidence of each word
def speech_to_text(args):
    model_path = os.path.join('models', args.model)
    if not os.path.exists(model_path):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack to 'models' folder.")
        exit(1)
    model = Model(model_path)  # load once and reuse across files
    for filepath in glob.iglob(os.path.join(os.getcwd(), args.data, '*.wav')):
        print(filepath)
        wf = wave.open(filepath, "rb")  # open the matched file, not the directory
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
        # FinalResult() flushes the recognizer, so capture it once and reuse it.
        final = rec.FinalResult()
        print(final)
        base = os.path.splitext(os.path.basename(filepath))[0]
        hypothesis_path = os.path.join(args.hypothesis, base + '.txt')
        with open(hypothesis_path, 'w') as hypothesis:
            hypothesis.write(final)
def wav2str(filename, sample_rate=16000, foldername="voskmodel"):
    model = Model(foldername)  # foldername is the model folder
    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)  # assumed fix: the per-word 'result' field is used below
    wf = wave.open(filename, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    results = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())
    words = []
    for res in results:
        jres = json.loads(res)
        if 'result' not in jres:
            continue
        words.extend(entry['word'] for entry in jres['result'])
    return words
def upload_voice_input(request):
    if request.method == "POST":
        my_file = request.FILES.get("myfile", None)
        if not my_file:
            print("no files for upload!")
            return HttpResponse("no files for upload!")
        saved_path = os.path.join("media/voice", my_file.name)
        with open(saved_path, 'wb+') as destination:
            for chunk in my_file.chunks():
                destination.write(chunk)
        rec = KaldiRecognizer(vosk_model, 16000)
        # Transcribe the file we just saved instead of a hard-coded name, and
        # accumulate every finalized segment rather than discarding Result().
        wf = wave.open(saved_path, "rb")
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result())['text'])
        segments.append(json.loads(rec.FinalResult())['text'])
        voicetext = ' '.join(filter(None, segments))
        print(voicetext)
        selectitem = dragon_cf['voicerec'][voice_section]
        item_value_array = selectitem.split(',')
        if voicetext not in item_value_array:
            dragon_cf.set('voicerec', voice_section, selectitem + ',' + voicetext)
            with open(voice_rec_config, 'w') as cfg:
                dragon_cf.write(cfg)
            return HttpResponse("voice added")
        return HttpResponse("voice already exists")
    return HttpResponse("no files for upload!")
async def processVoice(waveChunk, recognizer: KaldiRecognizer):
    """ Recognize audio chunk and process with terminal.onText() """
    signature = None
    text = ''
    final = False
    try:
        final = recognizer.AcceptWaveform(waveChunk)
        if final:
            # The phrase was recognized completely
            j = json.loads(recognizer.FinalResult())
            text = str(j['text']).strip() if 'text' in j else ''
        else:
            # Get the partially recognized text
            j = json.loads(recognizer.PartialResult())
            text = str(j['partial']).strip() if 'partial' in j else ''
        # Try to extract the voice signature:
        signature = j["spk"] if 'spk' in j else []
    except KeyboardInterrupt as e:
        onCtrlC()
        raise e
    except Exception as e:
        logError(f'Exception processing phrase chunk: {e}')
    return (final, text, signature)
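# processVoice() above looks for a speaker signature in j["spk"]; Vosk only
# emits that field when a speaker model is attached to the recognizer. A
# minimal sketch, assuming a speaker model unpacked as "model-spk" (both
# directory names here are placeholders, not from the original code):
from vosk import Model, SpkModel, KaldiRecognizer

def make_speaker_recognizer(model_dir="model", spk_dir="model-spk", rate=16000):
    rec = KaldiRecognizer(Model(model_dir), rate)
    rec.SetSpkModel(SpkModel(spk_dir))  # final results now carry an x-vector in "spk"
    return rec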
def recognize(self):
    if not os.path.exists("Speech_Recognition/model"):
        print("Please create the speech model as 'Speech_Recognition/model'.")
        exit(1)
    # Convert the input to mono so Vosk accepts it
    sound = AudioSegment.from_wav(self.file_folder)
    sound = sound.set_channels(1)
    sound.export("path.wav", format="wav")
    wf = wave.open('path.wav', "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("Speech_Recognition/model")
    rec = KaldiRecognizer(model, wf.getframerate())
    result = ''
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result += json.loads(rec.Result())['text'] + ' '
    result += json.loads(rec.FinalResult())['text']
    return result
def translate_file(filename="last5.wav"): SetLogLevel(0) if not os.path.exists("model"): print( "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder." ) exit(1) filepath = "./" + filename wf = wave.open(filepath, "rb") if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype( ) != "NONE": print("Audio file must be WAV format mono PCM.") exit(1) model = Model("./model") rec = KaldiRecognizer(model, wf.getframerate()) while True: data = wf.readframes(4000) if len(data) == 0: break if rec.AcceptWaveform(data): print(rec.Result()) else: print(rec.PartialResult()) results = rec.FinalResult() return json.loads(results)[ "text"] #["results"] for confidence of each word
def speechtotext(filename):
    if not os.path.exists("model-en"):
        print("Please download the model from https://github.com/alphacep/kaldi-android-demo/releases "
              "and unpack as 'model-en' in the current folder.")
        exit(1)
    wf = wave.open(filename, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model-en")
    rec = KaldiRecognizer(model, wf.getframerate())
    text = ""
    while True:
        data = wf.readframes(100000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            text = text + res['text'] + ' '  # separate segments with a space
    res = json.loads(rec.FinalResult())
    text = text + res['text']
    return text
def ShiBie_ZiRanYuYan():
    wf = wave.open('yuyin.wav', "rb")
    model = Model("model")
    rec = KaldiRecognizer(model, wf.getframerate())
    wenben = ""
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            wenben += res['text']  # accumulate every finalized segment
            # print("Recognition result:", wenben)
    wenben += json.loads(rec.FinalResult())['text']
    # Strip spaces, quotes and newlines from the recognized text
    for c in ' "\n':
        wenben = wenben.replace(c, '')
    return wenben
def ShiBie_ZiFu():
    if PanDuan == "":
        ZiFuJi = "继 续 检 搜 索 全 部 无 损 听 歌 播 放 音 乐 停 止 诗 词 单 曲 专 辑 循 环 顺 序 随 相 声 评 书 讲 坛 朗 读 关 机 复 制 上 下 一 个 从 头 添 加 收 藏 中 文 日 语 英 更 新 升 级 清 空 谁 多 少 什 么 唱 名 叫 他 的"
    else:
        ZiFuJi = "对 是 嗯 没 错"
    wenben = ""
    model = Model("model")
    # The grammar argument must be a JSON list of phrases; "[unk]" catches
    # everything outside the list.
    rec = KaldiRecognizer(model, 16000, json.dumps([ZiFuJi, "[unk]"], ensure_ascii=False))
    WaveWenJian = open("yuyin.wav", "rb")
    WaveWenJian.read(44)  # skip the 44-byte WAV header, leaving raw PCM
    while True:
        data = WaveWenJian.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            wenben = res['text']
            print("Recognition result: " + res['text'])
    res = json.loads(rec.FinalResult())
    if wenben == "":
        wenben = res['text']
    print("Final result: " + wenben)
    return wenben
def recognition():
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    wf = wave.open(sys.argv[1], "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    model = Model("model")
    # You can also specify the possible word or phrase list as a JSON list;
    # the order doesn't have to be strict.
    rec = KaldiRecognizer(
        model, wf.getframerate(),
        '["oh one two three four five six seven eight nine zero", "[unk]"]')
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            print(rec.Result())
        else:
            print(rec.PartialResult())
    print(rec.FinalResult())
def recognizer_process(queue_audio, queue_text):
    """Drain queue_audio and put recognizer result JSON strings (for json.loads()) into queue_text."""
    print('Worker started')
    rec = KaldiRecognizer(model, 8000)
    last_received = datetime.datetime.now()
    partial = False
    while True:
        queue_bytes = b''
        while not queue_audio.empty():
            last_received = datetime.datetime.now()
            queue_bytes += queue_audio.get()
        if queue_bytes:
            partial = True  # there is audio that has not been finalized yet
            if rec.AcceptWaveform(queue_bytes):
                partial = False
                queue_text.put(rec.Result())
        # Stop after 60 seconds with no incoming audio, flushing any pending text.
        if datetime.datetime.now() - datetime.timedelta(seconds=60) > last_received:
            if partial:
                queue_text.put(rec.FinalResult())
            print('Worker stopped')
            time.sleep(1)
            return
        time.sleep(1)
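# A hedged sketch of driving recognizer_process() from the parent process; it
# assumes the worker module defines the global `model` the function reads, and
# that fork-style multiprocessing makes it visible in the child:
from multiprocessing import Process, Queue

def start_recognizer_worker():
    queue_audio, queue_text = Queue(), Queue()
    worker = Process(target=recognizer_process, args=(queue_audio, queue_text))
    worker.start()
    return worker, queue_audio, queue_text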
def _get_data_in_audio(self, audio_wav_path: str):
    """
    :param audio_wav_path: path to wav
    :return: list of recognized text segments
    """
    wf = wave.open(audio_wav_path, "rb")
    # Reject anything that is not mono 16-bit PCM
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        return
    # List for collecting the recognized segments
    result = list()
    # wf.getframerate() returns the sampling rate
    rec = KaldiRecognizer(self.model, wf.getframerate())
    while True:
        data = wf.readframes(1000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # Get the segment result as JSON
            jsonData = json.loads(rec.Result())
            result.append(jsonData['text'])
    jsonData = json.loads(rec.FinalResult())
    if jsonData.get('text'):  # skip the final chunk when it is empty
        result.append(jsonData['text'])
    wf.close()
    self.raw_data = result
    return result
def Recognize(self, request, context):
    recognizer = KaldiRecognizer(self.model, vosk_sample_rate)
    # Feed the whole decoded clip at once; FinalResult() then flushes the text.
    # This assumes the clip is short enough to fit in a single utterance.
    recognizer.AcceptWaveform(self.mp3ToWav(request.audio_content))
    finalResult = recognizer.FinalResult()
    print(finalResult)
    return self.get_response(finalResult)
def process_file(self, file_name):
    """
    Run the Vosk model on the input file
    :param file_name: Input wav or mp3 file
    :return: List of dictionaries containing: confidence, start time, end time
             and the predicted word
    """
    logger.info(f'Recognising speech for {file_name}')
    wf = wave.open(file_name, "rb")
    # Check that the audio file can be read by the Vosk model
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise Exception(f'Invalid file format for {file_name}')
    rec = KaldiRecognizer(self.model, wf.getframerate())
    rec.SetWords(True)  # assumed fix: emit the per-word 'result' list used below
    results = []
    while True:
        data = wf.readframes(config.frame_to_read)
        # If the data we have read is empty then we are at the end of the file
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            # Result can contain an empty text string but no result list
            if len(result['text']) > 0:
                # If we reach here we have accepted the transcription of a section of audio
                results.extend(result['result'])
    result = json.loads(rec.FinalResult())
    # Add to results list
    if len(result['text']) > 0:
        results.extend(result['result'])
    logger.info(f'Processed speech, captured {len(results)} results')
    return results
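# For reference, each dictionary that process_file() returns follows Vosk's
# per-word format (values here are illustrative, not real output):
#   {"conf": 0.95, "start": 1.02, "end": 1.44, "word": "hello"}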
class WakeWordDetector:
    """Wake word detector, a thin wrapper around `vosk-api <https://github.com/alphacep/vosk-api>`_;
    the default wake words are `'阿Q'` and `'R-Cute'`.

    To customize the wake words, see
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_words.py
    """

    def __init__(
        self,
        sr=16000,
        lang='en',
        grammar='[ "a b c d e f g h i j k l m n o p q r s t u v w x y z key cute", "[unk]" ]'
    ):
        self.load(lang)
        self._det = KaldiRecognizer(util.cache[f'vosk.{lang}'], sr, grammar)

    def _detected(self, text):
        if text == 'r q':
            return '阿Q'
        elif text == 'r cute':
            return 'R-Cute'

    def load(self, lang='en'):
        """load language model in advance"""
        # Only construct the Model when it is not cached yet; passing it as a
        # default to cache.get() would build it eagerly on every call.
        if f'vosk.{lang}' not in util.cache:
            util.cache[f'vosk.{lang}'] = Model(util.data_file(f'vosk/{lang}'))

    def detect(self, source, timeout=None):
        """Start detecting.

        :param source: audio source
        :param timeout: maximum detection time in seconds; `None` (the default)
            means no timeout, i.e. only return once a wake word is detected
        :type timeout: float, optional
        :return: the detected wake word, or `None` if the timeout expired first
        :rtype: str
        """
        self._cancel = False  # possible race condition?
        if timeout:
            count = 0.0
        self._det.FinalResult()  # clear buffer
        while True:
            segment = source.read()
            if self._det.AcceptWaveform(segment.raw_data):
                p = self._detected(json.loads(self._det.Result())['text'])
            else:
                p = self._detected(json.loads(self._det.PartialResult())['partial'])
            if p:
                return p
            if self._cancel:
                return  # raise RuntimeError('Hotword detection cancelled by another thread')
            elif timeout:
                count += segment.duration_seconds
                if count > timeout:
                    return  # self._detected(self._det.FinalResult()['text'])

    def cancel(self):
        """Stop detecting."""
        self._cancel = True
def stt(fp, buffer_size=8192) -> str:
    kaldi = KaldiRecognizer(kaldi_model, 16000)
    buf = bytearray(buffer_size)
    got_audio = False
    while True:
        n = fp.readinto(buf)
        if not n:
            break
        # Feed only the bytes actually read; a short final read would otherwise
        # leak stale data from the end of the buffer into the recognizer.
        kaldi.AcceptWaveform(bytes(buf[:n]))
        got_audio = True
    return json.loads(kaldi.FinalResult())['text'] if got_audio else ''
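# Hypothetical usage of stt(): it expects raw 16 kHz mono 16-bit PCM, so for a
# standard WAV file the 44-byte header must be skipped first (the same trick
# ShiBie_ZiFu() uses above). The file name is a placeholder:
def stt_wav(path="speech.wav"):
    with open(path, "rb") as fp:
        fp.read(44)  # discard the WAV header, leaving raw PCM
        return stt(fp)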
def speech(request):
    result = {
        'имя': None,       # first name
        'фамилия': None,   # last name
        'отчество': None,  # patronymic
    }
    questions = {
        'первый': None,    # "first" question
    }
    answer = None
    text = None
    if request.method == "POST":
        form = NameForm(request.POST, files=request.FILES)
        if form.is_valid():
            wf = wave.open(form.cleaned_data['file'], mode="rb")
            rec = KaldiRecognizer(model, wf.getframerate())
            # Accumulate every finalized segment; relying on FinalResult()
            # alone would keep only the last utterance.
            segments = []
            while True:
                data = wf.readframes(5000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    segments.append(json.loads(rec.Result())['text'])
            segments.append(json.loads(rec.FinalResult())['text'])
            text = ' '.join(filter(None, segments))
            res_list = text.split()
            # The word right after a keyword is taken as its value,
            # e.g. "имя иван" -> result['имя'] = 'иван'.
            for word in result.keys():
                if word in res_list:
                    result[word] = res_list[res_list.index(word) + 1]
            for word in questions.keys():
                if word in res_list:
                    questions[word] = res_list[res_list.index(word) + 2]
            if questions['первый'] == 'да':   # "yes"
                answer = 2
            if questions['первый'] == 'нет':  # "no"
                answer = 1
            form = NameForm({
                'last_name': result['фамилия'],
                'first_name': result['имя'],
                'middle_name': result['отчество'],
                'choice': answer
            })
    else:
        form = NameForm()
    return render(request, 'speech/speech.html', context={
        'text': text,
        'form': form
    })
def next_sentence(self, process):
    recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
    while True:
        data = process.stdout.read(8000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            yield self.format_result(recognizer.Result())
    yield self.format_result(recognizer.FinalResult(), final=True)
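# next_sentence() reads raw PCM from a subprocess's stdout; a sketch of how
# such a process can be started, using the same ffmpeg invocation as the
# command-line snippets later in this file:
import subprocess

def open_pcm_stream(filename, sample_rate=16000):
    return subprocess.Popen(
        ['ffmpeg', '-loglevel', 'quiet', '-i', filename,
         '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'],
        stdout=subprocess.PIPE)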
def main():
    argv = sys.argv[1:]
    model_path = "./model"
    filename = ""
    try:
        # Long options must not contain spaces: "file_name=" takes an argument.
        opts, _ = getopt.getopt(argv, "f:m:", ["file_name=", "model_path="])
    except getopt.GetoptError:
        print("Error with arguments")
        return
    for opt, arg in opts:
        if opt in ['-f', '--file_name']:
            filename = arg
        elif opt in ['-m', '--model_path']:
            model_path = arg
    print("FILE: ", filename, " MODEL: ", model_path)
    if not os.path.exists(model_path):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        return
    SetLogLevel(-1)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)
    # Decode any input format to 16 kHz mono 16-bit PCM on ffmpeg's stdout
    process = subprocess.Popen(
        ['ffmpeg', '-loglevel', 'quiet', '-i', filename,
         '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'],
        stdout=subprocess.PIPE)
    result = ""
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result += json.loads(rec.Result())['text'] + ' '
    result += json.loads(rec.FinalResult())['text']
    print("\n")
    print(result)
def StreamingRecognize(self, request_iterator, context):
    request = next(request_iterator)
    partial = request.config.specification.partial_results
    recognizer = KaldiRecognizer(self.model,
                                 request.config.specification.sample_rate_hertz)
    for request in request_iterator:
        res = recognizer.AcceptWaveform(request.audio_content)
        if res:
            yield self.get_response(recognizer.Result())
        elif partial:
            yield self.get_response(recognizer.PartialResult())
    yield self.get_response(recognizer.FinalResult())
def reconize(model_path, process):
    vosk_model = Model(model_path)
    # sample_rate is assumed to be a module-level global here
    recognizer = KaldiRecognizer(vosk_model, sample_rate)
    recognizer.SetWords(True)
    while True:
        data = process.stdout.read(8000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            yield format_result(recognizer.Result())
    yield format_result(recognizer.FinalResult())
def creat_text_gpu(path):
    wf = wave.open(path.replace('.wav', '_mono.wav'), "rb")
    rec = KaldiRecognizer(model, wf.getframerate())
    # Feed the whole file; intermediate and partial results are discarded, so
    # this assumes the audio is short enough to end up in one final segment.
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)
    write_file(parse_json(rec.FinalResult()),
               path.split('/')[-1].replace('.wav', ''))
def post(self):
    voice_data = self.get_argument('voice')
    tmpfile = utils.write_temp_file(base64.b64decode(voice_data), '.mp3',
                                    '/home/asrdatabases')
    fname, _ = os.path.splitext(tmpfile)
    nfile = fname + '-16k.wav'
    # Downsample to 16 kHz with sox
    soxCall = 'sox ' + tmpfile + ' ' + nfile + ' rate 16k'
    subprocess.call(soxCall, shell=True, close_fds=True)
    utils.check_and_delete(tmpfile)
    wf = wave.open(nfile, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
        res = {
            "code": 1,
            "err_msg": "Audio file must be WAV format mono PCM."
        }
        self.write(json.dumps(res))
    else:
        model = Model("model")
        rec = KaldiRecognizer(model, wf.getframerate())
        # Accumulate every finalized segment instead of discarding Result()
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get('text', ''))
        segments.append(json.loads(rec.FinalResult()).get('text', ''))
        text = ''.join(''.join(segments).split())  # drop all whitespace
        if len(text) < 3:
            res = {"code": 1, "result": "Invalid audio. Please try again."}
        else:
            res = {"code": 0, "result": text}
        self.write(json.dumps(res))
    self.finish()
def video2data(self, url):
    """Get the recognized text of a video from its url."""
    current_dir = os.getcwd()
    os.chdir(self.path)
    ydl_opts = {
        'format': 'bestaudio/best',
        'writeinfojson': 'info',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'progress_hooks': [self._catch_filename],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    time.sleep(20)  # give the postprocessor time to finish writing the mp3
    video_description = self._downloaded_data()
    model = Model(self.kaldi_path)
    rec = KaldiRecognizer(model, 16000)
    # Decode the mp3 to 16 kHz mono 16-bit PCM on ffmpeg's stdout
    process = subprocess.Popen(
        ['ffmpeg', '-loglevel', 'quiet', '-i',
         os.path.join(self.path, self.filename),
         '-ar', str(16_000), '-ac', '1', '-f', 's16le', '-'],
        stdout=subprocess.PIPE)
    full_text = ''
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            full_text += ' ' + res['text']
    full_text += ' ' + json.loads(rec.FinalResult())['text']
    os.remove(os.path.join(self.path, self.description_file))
    os.remove(os.path.join(self.path, self.filename))
    os.chdir(current_dir)
    return full_text, video_description
def listen(wf):
    model = Model('model')
    rec = KaldiRecognizer(model, wf.getframerate())

    def g():
        # Yield each finalized segment as a parsed JSON dict
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                yield json.loads(rec.Result())

    # The caller invokes f() after exhausting g() to flush the last segment
    f = lambda: json.loads(rec.FinalResult())
    return (g(), f)
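# Hypothetical usage of listen(): consume the generator for the finalized
# segments, then call the returned finalizer once for the trailing audio:
def transcribe_with_listen(path):
    segments, final = listen(wave.open(path, "rb"))
    texts = [seg['text'] for seg in segments]
    texts.append(final()['text'])
    return ' '.join(filter(None, texts))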
def recognize(line):
    uid, fn = line.split()
    wf = wave.open(fn, "rb")
    rec = KaldiRecognizer(model, wf.getframerate())
    text = ""
    while True:
        data = wf.readframes(1000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            jres = json.loads(rec.Result())
            text = text + " " + jres['text']
    jres = json.loads(rec.FinalResult())
    text = text + " " + jres['text']
    return (uid + text)
def StreamingRecognize(self, request_iterator, context):
    request = next(request_iterator)
    partial = request.config.specification.partial_results
    recognizer = KaldiRecognizer(
        self.model, request.config.specification.sample_rate_hertz)
    recognizer.SetMaxAlternatives(
        request.config.specification.max_alternatives)
    recognizer.SetWords(
        request.config.specification.enable_word_time_offsets)
    for request in request_iterator:
        res = recognizer.AcceptWaveform(request.audio_content)
        if res:
            yield self.get_response(recognizer.Result())
        elif partial:
            yield self.get_response(recognizer.PartialResult())
    yield self.get_response(recognizer.FinalResult())
def speech_recog(fileIn):
    datalist = []
    SetLogLevel(0)
    if not os.path.exists("model"):
        print("Please download the model from https://alphacephei.com/vosk/models "
              "and unpack as 'model' in the current folder.")
        exit(1)
    sample_rate = 16000
    model = Model("model")
    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)  # assumed fix: the per-word "result" entries are used below
    # Decode any input format to 16 kHz mono 16-bit PCM on ffmpeg's stdout
    process = subprocess.Popen(
        ['ffmpeg', '-loglevel', 'quiet', '-i', fileIn,
         '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'],
        stdout=subprocess.PIPE)
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            datalist.append(json.loads(rec.Result()))
    datalist.append(json.loads(rec.FinalResult()))
    print(fileIn)
    # Tag every word entry with the source file
    for entry in datalist:
        if "result" in entry:
            for word in entry["result"]:
                word.update({"file": fileIn})
    words = words_from_list(datalist)
    with open(os.path.splitext(fileIn)[0] + ".json", "w") as output_json:
        output_json.write(json.dumps(datalist))
    return words
def transcribe_vosk_filename(filepath, model_path):
    model = Model(model_path)
    wf = wave.open(filepath, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        exit(1)
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)  # needed for the per-word timestamps in "result"
    rec.AcceptWaveform(wf.readframes(10**8))  # feed the whole file at once
    result = json.loads(rec.FinalResult())
    stamps = [x["start"] for x in result["result"]]
    words = [x["word"] for x in result["result"]]
    return words, stamps
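# Hypothetical demo of transcribe_vosk_filename() (file and model paths are
# placeholders): print each recognized word next to its start timestamp.
if __name__ == "__main__":
    words, stamps = transcribe_vosk_filename("speech.wav", "model")
    for word, start in zip(words, stamps):
        print(f"{start:7.2f}s  {word}")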