def create_recogniser(basedir):
    """Initialise the module-level PyOnlineLatgenRecogniser.

    Args:
        basedir: Directory containing the ``models`` subdirectory with the
            MFCC config, acoustic model, HCLG decoding graph and LDA matrix.
    """
    global recogniser
    recogniser = PyOnlineLatgenRecogniser()
    argv = [
        '--config=%s/models/mfcc.conf' % basedir,
        '--verbose=0',
        '--max-mem=10000000000',
        '--lat-lm-scale=15',
        '--beam=12.0',
        '--lattice-beam=6.0',
        '--max-active=5000',
        '%s/models/tri2b_bmmi.mdl' % basedir,
        '%s/models/HCLG_tri2b_bmmi.fst' % basedir,
        # Positional argument: colon-separated silence phone IDs.
        '1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25',
        # BUG FIX: path previously contained a doubled slash ('%s//models/...').
        '%s/models/tri2b_bmmi.mat' % basedir,
    ]
    recogniser.setup(argv)
def create_asr():
    """Build a fully wired ASR instance from the application config.

    Returns:
        ASR: decoder plus n-best and best-path post-processors.
    """
    import config
    from kaldi.utils import lattice_to_nbest, wst2dict
    from kaldi.decoders import PyOnlineLatgenRecogniser
    from asr_utils import lattice_calibration

    # Decoder configured straight from the project-wide Kaldi settings.
    decoder = PyOnlineLatgenRecogniser()
    decoder.setup(config.kaldi_config)

    # Word-symbol table drives the path -> text conversion.
    symbol_map = wst2dict(config.wst_path)
    to_text = PathToText(symbol_map)

    return ASR(
        decoder,
        ToNBest(to_text, lattice_to_nbest, lattice_calibration),
        ToBestPath(to_text),
    )
class TestPyOnlineLatgenRecogniserNotInit(unittest.TestCase):
    """Exercise PyOnlineLatgenRecogniser before setup() has been called.

    An un-initialised recogniser must fail gracefully (empty results,
    zero decoded frames) instead of crashing.
    """

    def setUp(self):
        self.d = PyOnlineLatgenRecogniser()

    def test_setup(self, args=['bad args']):
        # Bad arguments must be rejected, not accepted silently.
        self.assertFalse(self.d.setup(args))

    def test_decode(self, max_frames=10):
        # Nothing queued and no models loaded: zero frames decoded.
        self.assertEqual(self.d.decode(max_frames), 0)

    def test_frame_in(self):
        wav = b"ahoj"  # 16 bit audio -> 2 samples
        self.d.frame_in(wav)

    def test_frame_in_assert(self):
        wav = b"cau"  # 16 bit audio -> 1.5 samples == bad
        self.assertRaises(AssertionError, lambda: self.d.frame_in(wav))

    # BUG FIX: the four methods below were missing the 'test_' prefix,
    # so unittest discovery never ran them.
    def test_get_best_path(self):
        self.assertEqual(self.d.get_best_path(), [])

    def test_get_Nbest(self):
        self.assertEqual(self.d.get_Nbest(), [])

    def test_get_lattice(self):
        self.assertEqual(self.d.get_lattice(), None)

    def test_reset(self, keep_buffer_data=False):
        self.d.reset(keep_buffer_data)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write n-best hypotheses to file_output.

    Args:
        argv: argv-style decoder settings passed to setup().
        audio_batch_size: kept for interface compatibility (unused here).
        wav_paths: iterable of (wav_name, wav_path) pairs.
        file_output: open file-like object receiving the decoded n-best lists.
        wst_path: path to the word-symbol table (word id -> word string).
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # Duration = byte count / bytes-per-sample / sample-rate.
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        # Attach symbol tables so the lattice renders with words, not ids.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        # BUG FIX: the per-frame likelihood was truncated with int() before
        # being formatted with %f; report the float value (matches the
        # sibling implementation of this function).
        print(
            "Log-likelihood per frame for utterance %s is %f over %d frames"
            % (wav_name, lik / decoded_frames, decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write n-best hypotheses to file_output.

    Python 2 variant (uses print statements).
    NOTE(review): audio_batch_size is accepted but unused here - confirm callers.
    """
    wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # Duration = byte count / bytes-per-sample / sample-rate.
        print '%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr)
        lat, lik, decoded_frames = decode(d, pcm)
        # Attach symbol tables so the lattice renders with words, not ids.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        print "Log-likelihood per frame for utterance %s is %f over %d frames" % (
            wav_name, (lik / decoded_frames), decoded_frames)
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
class KaldiASR(ASRInterface):
    """Kaldi-based ASR component built around PyOnlineLatgenRecogniser.

    The recogniser decodes audio incrementally in the forward direction and
    generates the word lattice on demand by traversing the pruned decoding
    graph backwards. This variant supports optional score calibration and
    keeps the last produced lattice for inspection.
    """

    def __init__(self, cfg):
        """Create a KaldiASR instance and set it up from the configuration.

        Args:
            cfg (dict): Alex configuration; the ['ASR']['Kaldi'] section
                supplies model paths, decoder options and debug flags.
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()
        # Word-symbol table: maps decoder word ids to word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        # Upper bound of frames decoded per decode() call; keeps rec_in responsive.
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        if not 'matrix' in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix
        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
        self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))
        # Optional score-calibration table applied to lattices in hyp_out().
        self.calibration_table = kcfg[
            'calibration_table'] if 'calibration_table' in kcfg else None
        self.last_lattice = None
        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """Reset PyOnlineLatgenRecogniser to be ready for the next task.

        Returns:
            KaldiASR: self, for chaining.
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queue in an audio chunk and decode as much as currently possible.

        Defines the asynchronous interface for speech recognition.

        Args:
            frame (asr.components.hub.messages.Frame): carries the pcm payload.

        Returns:
            KaldiASR: self, for chaining.
        """
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)
        if self.cfg['ASR']['Kaldi']['debug']:
            # payload is 16-bit pcm, hence 2 bytes per sample.
            self.syslog.debug('frame_in of %d frames' %
                              (len(frame.payload) / 2))
        # Decode in bounded batches until the decoder reports no more frames.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        if self.cfg['ASR']['Kaldi']['debug']:
            if (frame_total > 0):
                # NOTE(review): time.clock() is deprecated and removed in
                # Python 3.8; consider time.perf_counter() when porting.
                self.syslog.debug('Forward decoding of %d frames in %s secs'
                                  % (frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """Produce the n-best hypothesis list for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Also stores the (optionally calibrated) lattice in self.last_lattice
        and resets the decoder for the next utterance.

        Returns:
            UtteranceNBList: ASR hypotheses about the input speech audio.
        """
        start = time.time()
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)
        if self.calibration_table:
            lat = lattice_calibration(lat, self.calibration_table)
        self.last_lattice = lat

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])
            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)
            # exp(-w): w is treated as a negative-log score.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        if len(nbest) == 0:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))
        nblist.merge()
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' %
                              str(time.time() - start))
        return nblist

    def word_post_out(self):
        """Produce per-word posterior lists for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Stores the raw lattice in self.last_lattice and resets the decoder.

        Returns:
            Word posterior lists derived from the decoding lattice.
        """
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.last_lattice = lat
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)

    def get_last_lattice(self):
        """Return the lattice saved by the most recent hyp_out/word_post_out."""
        return self.last_lattice
class LiveDemo(object):
    """Interactive microphone demo around PyOnlineLatgenRecogniser.

    Captures audio through a PyAudio callback, decodes it incrementally,
    and prints the best hypothesis whenever the user marks the end of an
    utterance ('u') or of the whole dialogue ('c').
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size: frames per PyAudio buffer.
            wst: word-symbol table dict mapping word ids to word strings.
            dec_args: argv-style decoder settings for setup().
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """(Re)initialise the decoder and open the microphone stream."""
        self.d.reset()
        # BUG FIX: previously called self.d.setup(argv), reading a global
        # instead of the decoder arguments passed to the constructor.
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Stop audio capture and release PyAudio resources."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # BUG FIX: previously assigned throwaway locals 'p, stream = None,
        # None'; clear the instance attributes instead.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Return a PyAudio stream callback feeding pcm into the decoder."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)  # keep audio for save_wav()
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the terminal mode, even on exceptions.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print(
            """Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n"""
            % (len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: poll keyboard, decode, report on utterance/dialog end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain everything the decoder can process right now.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # BUG FIX: previously read the global 'wst' instead of
                    # the table stored on the instance.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print(
                    "%s secs, frames: %d, prob: %f, %s " %
                    (str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Dump all captured audio to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
class KaldiASR(ASRInterface):
    """Kaldi-based ASR component built around PyOnlineLatgenRecogniser.

    The recogniser decodes audio incrementally in the forward direction and
    generates the word lattice on demand by traversing the pruned decoding
    graph backwards.
    """

    def __init__(self, cfg):
        """Create a KaldiASR instance and set it up from the configuration.

        Args:
            cfg (dict): Alex configuration; the ['ASR']['Kaldi'] section
                supplies model paths, decoder options and debug flags.
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()
        # Word-symbol table: maps decoder word ids to word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        # Upper bound of frames decoded per decode() call; keeps rec_in responsive.
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        if not 'matrix' in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix
        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
        self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))
        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """Reset PyOnlineLatgenRecogniser to be ready for the next task.

        Returns:
            KaldiASR: self, for chaining.
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queue in an audio chunk and decode as much as currently possible.

        Defines the asynchronous interface for speech recognition.

        Args:
            frame (asr.components.hub.messages.Frame): carries the pcm payload.

        Returns:
            KaldiASR: self, for chaining.
        """
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)
        if self.cfg['ASR']['Kaldi']['debug']:
            # payload is 16-bit pcm, hence 2 bytes per sample.
            self.syslog.debug('frame_in of %d frames' %
                              (len(frame.payload) / 2))
        # Decode in bounded batches until the decoder reports no more frames.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        if self.cfg['ASR']['Kaldi']['debug']:
            if (frame_total > 0):
                # NOTE(review): time.clock() is deprecated and removed in
                # Python 3.8; consider time.perf_counter() when porting.
                self.syslog.debug('Forward decoding of %d frames in %s secs' % (
                    frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """Produce the n-best hypothesis list for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Resets the decoder for the next utterance as a side effect.

        Returns:
            UtteranceNBList: ASR hypotheses about the input speech audio.
        """
        start = time.time()
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])
            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)
            # exp(-w): w is treated as a negative-log score.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        if len(nbest) == 0:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))
        nblist.merge()
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' %
                              str(time.time() - start))
        return nblist

    def word_post_out(self):
        """Produce per-word posterior lists for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Resets the decoder for the next utterance as a side effect.

        Returns:
            Word posterior lists derived from the decoding lattice.
        """
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)
    # NOTE(review): tail of a function whose 'def' line lies above this
    # chunk - it resets the shared decoder and returns the accumulated result.
    d.reset(keep_buffer_data=False)
    return result


def get_audio_callback():
    """Returns a callback - function which handles incoming audio."""
    def frame_in(in_data, frame_count, time_info, status):
        # Feed the raw pcm chunk straight into the module-level decoder 'd'.
        d.frame_in(in_data)
        return in_data, pyaudio.paContinue
    return frame_in


@app.route('/')
def index():
    # Serve the demo front-end page.
    return render_template('index.html')


# Module-level wiring; CLI: <audio_batch_size> <wst_path> <decoder argv...>.
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# Python 2 print statement (this module is Python 2 code).
print >> sys.stderr, 'Python args: %s' % str(sys.argv)
wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
d.setup(argv)
pin = pyaudio.PyAudio()
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())
app.run(host='0.0.0.0', debug=True)
class LiveDemo:
    """Interactive microphone demo around PyOnlineLatgenRecogniser.

    Captures audio through a PyAudio callback, decodes it incrementally,
    and prints the best hypothesis whenever the user marks the end of an
    utterance ('u') or of the whole dialogue ('c').
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size: frames per PyAudio buffer.
            wst: word-symbol table dict mapping word ids to word strings.
            dec_args: argv-style decoder settings for setup().
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """(Re)initialise the decoder and open the microphone stream."""
        self.d.reset()
        # BUG FIX: previously called self.d.setup(argv), reading a global
        # instead of the decoder arguments passed to the constructor.
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Stop audio capture and release PyAudio resources."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # BUG FIX: previously assigned throwaway locals 'p, stream = None,
        # None'; clear the instance attributes instead.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Return a PyAudio stream callback feeding pcm into the decoder."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)  # keep audio for save_wav()
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the terminal mode, even on exceptions.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print("""Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n""" % (
            len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: poll keyboard, decode, report on utterance/dialog end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain everything the decoder can process right now.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # BUG FIX: previously read the global 'wst' instead of
                    # the table stored on the instance.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print("%s secs, frames: %d, prob: %f, %s " % (
                    str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Dump all captured audio to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
def get_audio_callback():
    """Returns a callback - function which handles incoming audio."""
    def frame_in(in_data, frame_count, time_info, status):
        # Feed the raw pcm chunk straight into the module-level decoder 'd'.
        d.frame_in(in_data)
        return in_data, pyaudio.paContinue
    return frame_in


@app.route('/')
def index():
    # Serve the demo front-end page.
    return render_template('index.html')


# Module-level wiring; CLI: <audio_batch_size> <wst_path> <decoder argv...>.
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# Python 2 print statement (this module is Python 2 code).
print >> sys.stderr, 'Python args: %s' % str(sys.argv)
wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
d.setup(argv)
pin = pyaudio.PyAudio()
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())
app.run(host='0.0.0.0', debug=True)