results.append(part_result) else: # recognize speech using vosk model in streaming mode wf = wave.open(audioFile, "rb") while True: data = wf.readframes(4000) if len(data) == 0: break if rec.AcceptWaveform(data): part_result = json.loads(rec.Result()) results.append(part_result) part_result = json.loads(rec.FinalResult()) results.append(part_result) wf.close() # close audiofile rec.Reset() # convert list of JSON dictionaries to list of 'Word' objects ## Write CTM lines into the output file print('Writing Vosk Decoder output to files %s and %s' % (outCTMFile, outJSONFile)) outCTMFile = '%s/%s.ctm' % (VoskOutFolder, sessionId) outJSONFile = '%s/%s.json' % (VoskOutFolder, sessionId) with open(outJSONFile, "w") as ofp: ofp.write(json.dumps(results, indent=4)) list_of_Words = [] for sentence in results: if len(sentence) == 1: # sometimes there are bugs in recognition # and it returns an empty dictionary # {'text': ''} continue
class VoskInput(AudioInput):
    """
    Input from Vosk using the given language model.

    Raw audio handed to ``_feed_raw()`` is streamed into a
    ``KaldiRecognizer``; ``_decode()`` collects the accumulated results and
    turns them into ``Token`` instances.
    """
    def __init__(self,
                 notifier,
                 rate=16000,
                 wav_dir=None,
                 model=os.path.join(_MODEL_DIR, 'model')):
        """
        @see AudioInput.__init__()

        :type  rate: int
        :param rate:
            The override for the rate, if not the model's one.
        :type  wav_dir: str
        :param wav_dir:
            Where to save the wave files, if anywhere.
        :type  model: str
        :param model:
            The path to the Vosk model file.

        :raise IOError: if ``model`` does not exist on disk.
        """
        # Load in and configure the model.
        if not os.path.exists(model):
            raise IOError("Not found: %s" % (model,))
        LOG.info("Loading model from %s, this could take a while", model)
        # Map our effective log level onto Vosk's: quieter Vosk logging
        # unless we are at INFO (20) or chattier ourselves.
        SetLogLevel(1 if LOG.getLogger().getEffectiveLevel() >= 20 else 2)
        self._model = Model(model)
        self._recognizer = KaldiRecognizer(self._model, rate)
        LOG.info("Model loaded")

        # We can now init the superclass
        super(VoskInput, self).__init__(notifier,
                                        format=pyaudio.paInt16,
                                        channels=1,
                                        rate=rate,
                                        wav_dir=wav_dir)

        # Where we put the results
        self._results = []


    def _feed_raw(self, data):
        """
        @see AudioInput._feed_raw()
        """
        # Attempt to decode it; AcceptWaveform() only yields a result when
        # the recognizer has a completed utterance.
        if self._recognizer.AcceptWaveform(data):
            self._add_result(self._recognizer.Result())


    def _decode(self):
        """
        @see AudioInput._decode()

        :return: the list of ``Token``s decoded since the last call.
        """
        # Collect anything remaining
        self._add_result(self._recognizer.FinalResult())

        # Ensure it's clear for next time
        self._recognizer.Reset()

        # Tokenize. Use lazy logging arguments so we don't pay the
        # formatting cost when debug logging is disabled (matches the
        # LOG.info() calls above).
        tokens = []
        LOG.debug("Decoding: %s", self._results)
        for result in self._results:
            word = result.get('word', '').strip()
            conf = result.get('conf', 0.0)
            # Skip empty words and zero-confidence entries
            if word and conf:
                tokens.append(Token(word, conf, True))

        # Done with these now
        self._results = []

        # And give them all back
        LOG.debug("Got: %s", ' '.join(str(i) for i in tokens))
        return tokens


    def _add_result(self, json_result):
        """
        Add in any result we have from the given JSON string.

        :type  json_result: str
        :param json_result:
            A JSON document from the recognizer; either a full result
            (with a ``result`` list of per-word dicts) or a bare
            ``text`` string.
        """
        result = json.loads(json_result)
        LOG.debug("Got %s", json_result)

        # See what we got, if anything
        if 'result' in result:
            # A full result, which is the best: a list of
            # {'word': ..., 'conf': ...} dicts.
            self._results.extend(result['result'])
        elif 'text' in result:
            # A decoded text string only; synthesize full-confidence
            # per-word entries.
            for word in result['text'].split():
                if word:
                    self._results.append({'word': word,
                                          'conf': 1.0})