import unittest

# PyOnlineLatgenRecogniser is the pykaldi extension class under test; the
# import path follows the pykaldi checkout layout.
from kaldi.decoders import PyOnlineLatgenRecogniser


class TestPyOnlineLatgenRecogniserNotInit(unittest.TestCase):
    """Checks that a recogniser which was never set up degrades gracefully."""

    def setUp(self):
        self.d = PyOnlineLatgenRecogniser()

    def test_setup(self, args=['bad args']):
        self.assertFalse(self.d.setup(args))

    def test_decode(self, max_frames=10):
        self.assertEqual(self.d.decode(max_frames), 0)

    def test_frame_in(self):
        wav = b"ahoj"  # 16-bit audio -> 2 samples
        self.d.frame_in(wav)

    def test_frame_in_assert(self):
        wav = b"cau"  # 16-bit audio -> 1.5 samples == bad
        self.assertRaises(AssertionError, lambda: self.d.frame_in(wav))

    # unittest only discovers methods whose names start with test_.
    def test_get_best_path(self):
        self.assertEqual(self.d.get_best_path(), [])

    def test_get_Nbest(self):
        self.assertEqual(self.d.get_Nbest(), [])

    def test_get_lattice(self):
        self.assertEqual(self.d.get_lattice(), None)

    def test_reset(self, keep_buffer_data=False):
        self.d.reset(keep_buffer_data)
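# A minimal entry point for running the checks above directly; this is the
# standard unittest idiom and assumes nothing beyond the imports at the top
# of the file.
if __name__ == '__main__':
    unittest.main()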
import os
import time
from math import exp

# PyOnlineLatgenRecogniser, kaldi.utils and the lattice helpers
# (lattice_to_nbest, lattice_to_word_posterior_lists, lattice_calibration),
# as well as ASRInterface, Utterance and UtteranceNBList, are imported from
# the surrounding Alex/pykaldi packages in the original module.


class KaldiASR(ASRInterface):
    """Wraps Kaldi's PyOnlineLatgenRecogniser, which first decodes in the
    forward direction and then generates the lattice on demand by traversing
    the pruned decoding graph backwards.
    """

    def __init__(self, cfg):
        """Creates a KaldiASR instance and sets it up according to the
        configuration.

        Args:
            cfg(dict): Alex configuration
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']

        if os.path.isfile(kcfg['silent_phones']):
            # Replace the path to the file with its content.
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()

        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        if 'matrix' not in kcfg:
            kcfg['matrix'] = ''  # some models, e.g. tri2a, do not use a matrix

        # All other options are specified in the config file.
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
            self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))

        self.calibration_table = kcfg.get('calibration_table', None)
        self.last_lattice = None

        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """Resets the PyOnlineLatgenRecogniser so that it is ready for the
        next recognition task.

        Returns:
            self - the instance of KaldiASR
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queues in an audio chunk.

        Defines the asynchronous interface for speech recognition.

        Args:
            frame(asr.components.hub.messages.Frame): stores the PCM payload

        Returns:
            self - the instance of KaldiASR
        """
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.debug('frame_in of %d frames'
                              % (len(frame.payload) / 2))
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        if self.cfg['ASR']['Kaldi']['debug']:
            if frame_total > 0:
                self.syslog.debug('Forward decoding of %d frames in %s secs'
                                  % (frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """Defines the asynchronous interface for speech recognition.

        Returns:
            The ASR hypothesis about the input speech audio as an
            UtteranceNBList.
        """
        start = time.time()

        # Get the hypothesis: prune, then extract the lattice, which is
        # returned as an acceptor (py)fst.LogVectorFst.
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()
        self.decoder.reset(keep_buffer_data=False)
        if self.calibration_table:
            lat = lattice_calibration(lat, self.calibration_table)

        self.last_lattice = lat

        # Convert the lattice to an n-best list.
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])
            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)
            p = exp(-w)  # the n-best weights are negative log-probabilities
            nblist.add(p, Utterance(words))

        # Make a failed recognition explicit instead of returning an empty
        # n-best list.
        if len(nbest) == 0:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

        nblist.merge()

        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs'
                              % str(time.time() - start))
        return nblist

    def word_post_out(self):
        """Defines the asynchronous interface for speech recognition.

        Returns:
            The ASR hypotheses about the input speech audio as word
            posterior lists.
        """
        # Get the hypothesis: prune, then extract the lattice, which is
        # returned as an acceptor (py)fst.LogVectorFst.
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()
        self.last_lattice = lat
        self.decoder.reset(keep_buffer_data=False)

        # Convert the lattice to a word posterior n-best list.
        return lattice_to_word_posterior_lists(lat, self.n_best)

    def get_last_lattice(self):
        return self.last_lattice
import select
import sys
import termios
import time
import tty
import wave

import pyaudio

# PyOnlineLatgenRecogniser and lattice_to_nbest come from the pykaldi
# package in the original script. The audio constants below are assumed
# typical values for 16-bit mono speech input.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000


class LiveDemo(object):

    def __init__(self, audio_batch_size, wst, dec_args):
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        self.d.reset()
        self.d.setup(self.args)  # decoder arguments passed to the constructor
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''A simple (admittedly crude) way of controlling the recogniser
        state from the keyboard.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # Read single characters while there is data on stdin.
            while (select.select([sys.stdin], [], [], 1)
                    == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print("""Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n"""
              % (len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        while True:
            time.sleep(0.1)
            self._user_control()
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print("%s secs, frames: %d, prob: %f, %s "
                      % (str(time.time() - start), self.utt_frames, prob,
                         decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
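# A minimal driver sketch for the demo above. WST_PATH and DEC_ARGS are
# hypothetical placeholders for the word-symbol-table path and decoder
# arguments that the original script takes from the command line, and the
# batch size is an arbitrary example value; wst2dict is the kaldi.utils
# helper also used by KaldiASR.
if __name__ == '__main__':
    wst = kaldi.utils.wst2dict(WST_PATH)  # maps integer word ids to words
    demo = LiveDemo(audio_batch_size=4096, wst=wst, dec_args=DEC_ARGS)
    demo.setup()        # open the PyAudio stream and set up the decoder
    try:
        demo.run()      # decode until 'c' marks the end of the dialogue
    finally:
        demo.tear_down()  # stop and close the audio stream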