예제 #1
0
class TestPyOnlineLatgenRecogniserNotInit(unittest.TestCase):
    """Checks that a PyOnlineLatgenRecogniser which was never set up
    degrades gracefully instead of crashing."""

    def setUp(self):
        # Deliberately no call to setup(): every query below must return
        # an "empty" result or fail cleanly.
        self.d = PyOnlineLatgenRecogniser()

    def test_setup(self):
        # setup() must report failure for nonsense arguments.
        # (Moved the argument into the body: unittest never passes
        # extra parameters to test methods, so a default arg was dead weight.)
        self.assertFalse(self.d.setup(['bad args']))

    def test_decode(self):
        # Nothing was queued in, so decoding must process 0 frames.
        self.assertEqual(self.d.decode(10), 0)

    def test_frame_in(self):
        wav = b"ahoj"  # 16 bit audio -> 2 samples
        self.d.frame_in(wav)

    def test_frame_in_assert(self):
        wav = b"cau"  # 16 bit audio -> 1.5 samples == bad
        self.assertRaises(AssertionError, lambda: self.d.frame_in(wav))

    # BUG FIX: the four methods below lacked the ``test_`` prefix, so
    # unittest discovery silently skipped them and they never ran.
    def test_get_best_path(self):
        self.assertEqual(self.d.get_best_path(), [])

    def test_get_Nbest(self):
        self.assertEqual(self.d.get_Nbest(), [])

    def test_get_lattice(self):
        self.assertEqual(self.d.get_lattice(), None)

    def test_reset(self):
        # reset() on an uninitialised recogniser must not raise.
        self.d.reset(False)
예제 #2
0
class TestPyOnlineLatgenRecogniserNotInit(unittest.TestCase):
    """Verifies safe behaviour of an uninitialised PyOnlineLatgenRecogniser."""

    def setUp(self):
        # No setup() call on purpose — the recogniser is left uninitialised.
        self.d = PyOnlineLatgenRecogniser()

    def test_setup(self):
        # Bogus arguments must make setup() return a falsy value.
        # unittest never supplies extra arguments to test methods, so the
        # former ``args=['bad args']`` default was moved into the body.
        self.assertFalse(self.d.setup(['bad args']))

    def test_decode(self):
        # An uninitialised decoder must decode exactly 0 frames.
        self.assertEqual(self.d.decode(10), 0)

    def test_frame_in(self):
        wav = b"ahoj"  # 16 bit audio -> 2 samples
        self.d.frame_in(wav)

    def test_frame_in_assert(self):
        wav = b"cau"  # 16 bit audio -> 1.5 samples == bad
        self.assertRaises(AssertionError, lambda: self.d.frame_in(wav))

    # BUG FIX: without the ``test_`` prefix these checks were invisible to
    # unittest's test discovery and therefore never executed.
    def test_get_best_path(self):
        self.assertEqual(self.d.get_best_path(), [])

    def test_get_Nbest(self):
        self.assertEqual(self.d.get_Nbest(), [])

    def test_get_lattice(self):
        self.assertEqual(self.d.get_lattice(), None)

    def test_reset(self):
        # Resetting an uninitialised recogniser must succeed silently.
        self.d.reset(False)
예제 #3
0
class KaldiASR(ASRInterface):
    """ Wraps Kaldi PyOnlineLatgenRecogniser,

    which firstly decodes in forward direction and generates on demand a
    lattice by traversing the pruned decoding graph backwards.
    """
    def __init__(self, cfg):
        """
        Create KaldiASR instance and sets it according configuration

        Args:
            cfg(dict): Alex configuration
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()

        # Word-symbol table: maps integer word ids to word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        # FIX: idiomatic membership test ("not in" instead of "not ... in").
        if 'matrix' not in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix

        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
            self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))

        # Optional confidence calibration table; None when not configured.
        # FIX: dict.get replaces the membership-test/conditional-expression pair.
        self.calibration_table = kcfg.get('calibration_table')

        self.last_lattice = None

        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """
        Resets PyOnlineLatgenRecogniser in order to be ready for next recognition task

        Returns:
            self - The instance of KaldiASR
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queueing in audio chunk

        Defines asynchronous interface for speech recognition.

        Args:
            frame(asr.components.hub.messages.Frame): store pcm payload
        Returns:
            self - The instance of KaldiASR
        """
        # NOTE(review): time.clock was removed in Python 3.8; switch to
        # time.perf_counter once Python 2 support is dropped.
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)

        if self.cfg['ASR']['Kaldi']['debug']:
            # Payload is 16-bit PCM, hence 2 bytes per sample/frame.
            self.syslog.debug('frame_in of %d frames' %
                              (len(frame.payload) / 2))

        # Decode in bounded chunks until no more frames are processed.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)

        if self.cfg['ASR']['Kaldi']['debug']:
            if frame_total > 0:
                self.syslog.debug('Forward decoding of %d frames in %s secs' %
                                  (frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """ This defines asynchronous interface for speech recognition.

        Returns:
            ASR hypothesis about the input speech audio.
        """
        start = time.time()

        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice(
        )  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Optionally recalibrate the lattice confidence scores.
        if self.calibration_table:
            lat = lattice_calibration(lat, self.calibration_table)

        self.last_lattice = lat

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()

        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])

            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)

            # Negative log-likelihood -> probability.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        # FIX: idiomatic emptiness test instead of len(...) == 0.
        if not nbest:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

        nblist.merge()

        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' %
                              str(time.time() - start))

        return nblist

    def word_post_out(self):
        """ This defines asynchronous interface for speech recognition.

        Returns:
            ASR hypotheses (word posterior lists) about the input speech audio.
        """

        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice(
        )  # returns acceptor (py)fst.LogVectorFst
        self.last_lattice = lat

        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)

    def get_last_lattice(self):
        """Return the lattice produced by the most recent recognition, or None."""
        return self.last_lattice
예제 #4
0
class LiveDemo(object):
    """Interactive microphone demo driving a Kaldi PyOnlineLatgenRecogniser.

    Audio is captured via a pyaudio callback, decoded incrementally, and a
    best path is printed whenever the user marks an utterance end.
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size (int): pyaudio frames per buffer
            wst (dict): word-symbol table mapping word ids to word strings
            dec_args (list): decoder command-line arguments for setup()
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """Initialise the decoder and open the pyaudio input stream."""
        self.d.reset()
        # FIX: use the decoder arguments handed to the constructor instead of
        # the module-level global ``argv`` (self.args was stored but unused).
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Stop and release the audio stream and drop buffered frames."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # FIX: the original assigned throwaway locals ``p, stream = None, None``
        # which had no effect; clear the instance attributes instead.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Build the pyaudio callback: feed audio to the decoder and record it."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)
            return in_data, pyaudio.paContinue

        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''

        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            # cbreak mode so single key presses are readable without Enter.
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [],
                                 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the original terminal settings.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print(
            """Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n"""
            % (len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: decode incrementally, report on utterance/dialogue end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain the decoder in bounded chunks.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # FIX: look words up in self.wst; the original referenced
                    # the module-level global ``wst`` by mistake.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print(
                    "%s secs, frames: %d, prob: %f, %s " %
                    (str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Write all recorded audio chunks to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
예제 #5
0
파일: pykaldi.py 프로젝트: mkorvas/alex
class KaldiASR(ASRInterface):

    """ Wraps Kaldi PyOnlineLatgenRecogniser,

    which firstly decodes in forward direction and generates on demand a
    lattice by traversing the pruned decoding graph backwards.
    """

    def __init__(self, cfg):
        """
        Create KaldiASR instance and sets it according configuration

        Args:
            cfg(dict): Alex configuration
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()

        # Word-symbol table: integer word ids -> word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        # FIX: idiomatic membership test ("not in" instead of "not ... in").
        if 'matrix' not in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix

        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
            self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))

        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """
        Resets PyOnlineLatgenRecogniser in order to be ready for next recognition task

        Returns:
            self - The instance of KaldiASR
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queueing in audio chunk

        Defines asynchronous interface for speech recognition.

        Args:
            frame(asr.components.hub.messages.Frame): store pcm payload
        Returns:
            self - The instance of KaldiASR
        """
        # NOTE(review): time.clock was removed in Python 3.8; switch to
        # time.perf_counter once Python 2 support is dropped.
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)

        if self.cfg['ASR']['Kaldi']['debug']:
            # Payload is 16-bit PCM, hence 2 bytes per sample/frame.
            self.syslog.debug('frame_in of %d frames' % (len(frame.payload) / 2))

        # Decode in bounded chunks until no more frames are processed.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)

        if self.cfg['ASR']['Kaldi']['debug']:
            if frame_total > 0:
                self.syslog.debug('Forward decoding of %d frames in %s secs' % (
                    frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """ This defines asynchronous interface for speech recognition.

        Returns:
            ASR hypothesis about the input speech audio.
        """
        start = time.time()

        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])

            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)

            # Negative log-likelihood -> probability.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        # FIX: idiomatic emptiness test instead of len(...) == 0.
        if not nbest:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))

        nblist.merge()

        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' % str(time.time() - start))

        return nblist

    def word_post_out(self):
        """ This defines asynchronous interface for speech recognition.

        Returns:
            ASR hypotheses (word posterior lists) about the input speech audio.
        """

        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)
예제 #6
0
파일: live-demo.py 프로젝트: 2php/kaldi
class LiveDemo:
    """Live microphone demo for a Kaldi PyOnlineLatgenRecogniser.

    Captures audio through a pyaudio callback, decodes it incrementally and
    prints the best hypothesis when the user marks an utterance end.
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size (int): pyaudio frames per buffer
            wst (dict): word-symbol table mapping word ids to word strings
            dec_args (list): decoder command-line arguments for setup()
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """Reset the decoder, configure it, and open the audio input stream."""
        self.d.reset()
        # FIX: pass the stored constructor arguments rather than relying on
        # the module-level global ``argv`` (self.args was otherwise unused).
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT, channels=CHANNELS,
                                    rate=RATE, input=True, frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Close the stream, terminate pyaudio and forget buffered audio."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # FIX: replaced the no-op locals ``p, stream = None, None`` with
        # clearing the actual instance attributes.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Return a pyaudio callback feeding the decoder and the recording buffer."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''

        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            # cbreak mode: read single key presses without waiting for Enter.
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the original terminal settings.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print("""Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n""" % (len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: decode incrementally, report on utterance/dialogue end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain the decoder in bounded chunks.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # FIX: use self.wst; the original referenced the
                    # module-level global ``wst`` by mistake.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print("%s secs, frames: %d, prob: %f, %s " % (
                    str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Dump every recorded audio chunk to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()