def create_recogniser(basedir):
    """Initialise the module-level PyOnlineLatgenRecogniser.

    Args:
        basedir: Directory containing the ``models`` subdirectory with the
            MFCC config, acoustic model, HCLG decoding graph and LDA matrix.
    """
    global recogniser
    recogniser = PyOnlineLatgenRecogniser()
    argv = [
        '--config=%s/models/mfcc.conf' % basedir,
        '--verbose=0',
        '--max-mem=10000000000',
        '--lat-lm-scale=15',
        '--beam=12.0',
        '--lattice-beam=6.0',
        '--max-active=5000',
        '%s/models/tri2b_bmmi.mdl' % basedir,
        '%s/models/HCLG_tri2b_bmmi.fst' % basedir,
        # Positional argument: colon-separated silence phone IDs.
        '1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20:21:22:23:24:25',
        # BUG FIX: path previously contained a doubled slash ('%s//models/...').
        '%s/models/tri2b_bmmi.mat' % basedir,
    ]
    recogniser.setup(argv)
def create_asr():
    """Build a fully wired ASR instance from the application config.

    Returns:
        ASR: decoder plus n-best and best-path post-processors.
    """
    import config
    from kaldi.utils import lattice_to_nbest, wst2dict
    from kaldi.decoders import PyOnlineLatgenRecogniser
    from asr_utils import lattice_calibration

    # Decoder configured straight from the project-wide Kaldi settings.
    decoder = PyOnlineLatgenRecogniser()
    decoder.setup(config.kaldi_config)

    # Word-symbol table drives the path -> text conversion.
    symbol_map = wst2dict(config.wst_path)
    to_text = PathToText(symbol_map)

    return ASR(
        decoder,
        ToNBest(to_text, lattice_to_nbest, lattice_calibration),
        ToBestPath(to_text),
    )
class TestPyOnlineLatgenRecogniserNotInit(unittest.TestCase):
    """Exercise PyOnlineLatgenRecogniser before setup() has been called.

    An un-initialised recogniser must fail gracefully (empty results,
    zero decoded frames) instead of crashing.
    """

    def setUp(self):
        self.d = PyOnlineLatgenRecogniser()

    def test_setup(self, args=['bad args']):
        # Bad arguments must be rejected, not accepted silently.
        self.assertFalse(self.d.setup(args))

    def test_decode(self, max_frames=10):
        # Nothing queued and no models loaded: zero frames decoded.
        self.assertEqual(self.d.decode(max_frames), 0)

    def test_frame_in(self):
        wav = b"ahoj"  # 16 bit audio -> 2 samples
        self.d.frame_in(wav)

    def test_frame_in_assert(self):
        wav = b"cau"  # 16 bit audio -> 1.5 samples == bad
        self.assertRaises(AssertionError, lambda: self.d.frame_in(wav))

    # BUG FIX: the four methods below were missing the 'test_' prefix,
    # so unittest discovery never ran them.
    def test_get_best_path(self):
        self.assertEqual(self.d.get_best_path(), [])

    def test_get_Nbest(self):
        self.assertEqual(self.d.get_Nbest(), [])

    def test_get_lattice(self):
        self.assertEqual(self.d.get_lattice(), None)

    def test_reset(self, keep_buffer_data=False):
        self.d.reset(keep_buffer_data)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write n-best hypotheses to file_output.

    Args:
        argv: argv-style decoder settings passed to setup().
        audio_batch_size: kept for interface compatibility (unused here).
        wav_paths: iterable of (wav_name, wav_path) pairs.
        file_output: open file-like object receiving the decoded n-best lists.
        wst_path: path to the word-symbol table (word id -> word string).
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # Duration = byte count / bytes-per-sample / sample-rate.
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        # Attach symbol tables so the lattice renders with words, not ids.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        # BUG FIX: the per-frame likelihood was truncated with int() before
        # being formatted with %f; report the float value (matches the
        # sibling implementation of this function).
        print(
            "Log-likelihood per frame for utterance %s is %f over %d frames"
            % (wav_name, lik / decoded_frames, decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write n-best hypotheses to file_output.

    Python 2 variant (uses print statements).
    NOTE(review): audio_batch_size is accepted but unused here - confirm callers.
    """
    wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # Duration = byte count / bytes-per-sample / sample-rate.
        print '%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr)
        lat, lik, decoded_frames = decode(d, pcm)
        # Attach symbol tables so the lattice renders with words, not ids.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        print "Log-likelihood per frame for utterance %s is %f over %d frames" % (
            wav_name, (lik / decoded_frames), decoded_frames)
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
class KaldiASR(ASRInterface):
    """Kaldi-based ASR component built around PyOnlineLatgenRecogniser.

    The recogniser decodes audio incrementally in the forward direction and
    generates the word lattice on demand by traversing the pruned decoding
    graph backwards. This variant supports optional score calibration and
    keeps the last produced lattice for inspection.
    """

    def __init__(self, cfg):
        """Create a KaldiASR instance and set it up from the configuration.

        Args:
            cfg (dict): Alex configuration; the ['ASR']['Kaldi'] section
                supplies model paths, decoder options and debug flags.
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()
        # Word-symbol table: maps decoder word ids to word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        # Upper bound of frames decoded per decode() call; keeps rec_in responsive.
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        if not 'matrix' in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix
        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
        self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))
        # Optional score-calibration table applied to lattices in hyp_out().
        self.calibration_table = kcfg[
            'calibration_table'] if 'calibration_table' in kcfg else None
        self.last_lattice = None
        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """Reset PyOnlineLatgenRecogniser to be ready for the next task.

        Returns:
            KaldiASR: self, for chaining.
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queue in an audio chunk and decode as much as currently possible.

        Defines the asynchronous interface for speech recognition.

        Args:
            frame (asr.components.hub.messages.Frame): carries the pcm payload.

        Returns:
            KaldiASR: self, for chaining.
        """
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)
        if self.cfg['ASR']['Kaldi']['debug']:
            # payload is 16-bit pcm, hence 2 bytes per sample.
            self.syslog.debug('frame_in of %d frames' %
                              (len(frame.payload) / 2))
        # Decode in bounded batches until the decoder reports no more frames.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        if self.cfg['ASR']['Kaldi']['debug']:
            if (frame_total > 0):
                # NOTE(review): time.clock() is deprecated and removed in
                # Python 3.8; consider time.perf_counter() when porting.
                self.syslog.debug('Forward decoding of %d frames in %s secs'
                                  % (frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """Produce the n-best hypothesis list for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Also stores the (optionally calibrated) lattice in self.last_lattice
        and resets the decoder for the next utterance.

        Returns:
            UtteranceNBList: ASR hypotheses about the input speech audio.
        """
        start = time.time()
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)
        if self.calibration_table:
            lat = lattice_calibration(lat, self.calibration_table)
        self.last_lattice = lat

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])
            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)
            # exp(-w): w is treated as a negative-log score.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        if len(nbest) == 0:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))
        nblist.merge()
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' %
                              str(time.time() - start))
        return nblist

    def word_post_out(self):
        """Produce per-word posterior lists for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Stores the raw lattice in self.last_lattice and resets the decoder.

        Returns:
            Word posterior lists derived from the decoding lattice.
        """
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.last_lattice = lat
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)

    def get_last_lattice(self):
        """Return the lattice saved by the most recent hyp_out/word_post_out."""
        return self.last_lattice
class LiveDemo(object):
    """Interactive microphone demo around PyOnlineLatgenRecogniser.

    Captures audio through a PyAudio callback, decodes it incrementally,
    and prints the best hypothesis whenever the user marks the end of an
    utterance ('u') or of the whole dialogue ('c').
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size: frames per PyAudio buffer.
            wst: word-symbol table dict mapping word ids to word strings.
            dec_args: argv-style decoder settings for setup().
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """(Re)initialise the decoder and open the microphone stream."""
        self.d.reset()
        # BUG FIX: previously called self.d.setup(argv), reading a global
        # instead of the decoder arguments passed to the constructor.
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Stop audio capture and release PyAudio resources."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # BUG FIX: previously assigned throwaway locals 'p, stream = None,
        # None'; clear the instance attributes instead.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Return a PyAudio stream callback feeding pcm into the decoder."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)  # keep audio for save_wav()
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the terminal mode, even on exceptions.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print(
            """Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n"""
            % (len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: poll keyboard, decode, report on utterance/dialog end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain everything the decoder can process right now.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # BUG FIX: previously read the global 'wst' instead of
                    # the table stored on the instance.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print(
                    "%s secs, frames: %d, prob: %f, %s " %
                    (str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Dump all captured audio to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
class KaldiASR(ASRInterface):
    """Kaldi-based ASR component built around PyOnlineLatgenRecogniser.

    The recogniser decodes audio incrementally in the forward direction and
    generates the word lattice on demand by traversing the pruned decoding
    graph backwards.
    """

    def __init__(self, cfg):
        """Create a KaldiASR instance and set it up from the configuration.

        Args:
            cfg (dict): Alex configuration; the ['ASR']['Kaldi'] section
                supplies model paths, decoder options and debug flags.
        """
        super(KaldiASR, self).__init__(cfg)
        kcfg = self.cfg['ASR']['Kaldi']
        if os.path.isfile(kcfg['silent_phones']):
            # replace the path of the file with its content
            with open(kcfg['silent_phones'], 'r') as r:
                kcfg['silent_phones'] = r.read()
        # Word-symbol table: maps decoder word ids to word strings.
        self.wst = kaldi.utils.wst2dict(kcfg['wst'])
        # Upper bound of frames decoded per decode() call; keeps rec_in responsive.
        self.max_dec_frames = kcfg['max_dec_frames']
        self.n_best = kcfg['n_best']
        if not 'matrix' in kcfg:
            kcfg['matrix'] = ''  # some models e.g. tri2a does not use matrix
        # specify all other options in config
        argv = ("--config=%(config)s --verbose=%(verbose)d %(extra_args)s "
                "%(model)s %(hclg)s %(silent_phones)s %(matrix)s" % kcfg)
        argv = argv.split()
        with open(kcfg['config']) as r:
            conf_opt = r.read()
        self.syslog.info('argv: %s\nconfig: %s' % (argv, conf_opt))
        self.decoder = PyOnlineLatgenRecogniser()
        self.decoder.setup(argv)

    def flush(self):
        """Reset PyOnlineLatgenRecogniser to be ready for the next task.

        Returns:
            KaldiASR: self, for chaining.
        """
        self.decoder.reset(keep_buffer_data=False)
        return self

    def rec_in(self, frame):
        """Queue in an audio chunk and decode as much as currently possible.

        Defines the asynchronous interface for speech recognition.

        Args:
            frame (asr.components.hub.messages.Frame): carries the pcm payload.

        Returns:
            KaldiASR: self, for chaining.
        """
        frame_total, start = 0, time.clock()
        self.decoder.frame_in(frame.payload)
        if self.cfg['ASR']['Kaldi']['debug']:
            # payload is 16-bit pcm, hence 2 bytes per sample.
            self.syslog.debug('frame_in of %d frames' %
                              (len(frame.payload) / 2))
        # Decode in bounded batches until the decoder reports no more frames.
        dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        while dec_t > 0:
            frame_total += dec_t
            dec_t = self.decoder.decode(max_frames=self.max_dec_frames)
        if self.cfg['ASR']['Kaldi']['debug']:
            if (frame_total > 0):
                # NOTE(review): time.clock() is deprecated and removed in
                # Python 3.8; consider time.perf_counter() when porting.
                self.syslog.debug('Forward decoding of %d frames in %s secs' % (
                    frame_total, str(time.clock() - start)))
        return self

    def hyp_out(self):
        """Produce the n-best hypothesis list for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Resets the decoder for the next utterance as a side effect.

        Returns:
            UtteranceNBList: ASR hypotheses about the input speech audio.
        """
        start = time.time()
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to nblist
        nbest = lattice_to_nbest(lat, self.n_best)
        nblist = UtteranceNBList()
        for w, word_ids in nbest:
            words = u' '.join([self.wst[i] for i in word_ids])
            if self.cfg['ASR']['Kaldi']['debug']:
                self.syslog.debug(words)
            # exp(-w): w is treated as a negative-log score.
            p = exp(-w)
            nblist.add(p, Utterance(words))

        # Log
        if len(nbest) == 0:
            nblist.add(1.0, Utterance('Empty hypothesis: Kaldi __FAIL__'))
        nblist.merge()
        if self.cfg['ASR']['Kaldi']['debug']:
            self.syslog.info('utterance "likelihood" is %f' % utt_lik)
            self.syslog.debug('hyp_out: get_lattice+nbest in %s secs' %
                              str(time.time() - start))
        return nblist

    def word_post_out(self):
        """Produce per-word posterior lists for the queued audio.

        Defines the asynchronous interface for speech recognition.
        Resets the decoder for the next utterance as a side effect.

        Returns:
            Word posterior lists derived from the decoding lattice.
        """
        # Get hypothesis
        self.decoder.prune_final()
        utt_lik, lat = self.decoder.get_lattice()  # returns acceptor (py)fst.LogVectorFst
        self.decoder.reset(keep_buffer_data=False)

        # Convert lattice to word nblist
        return lattice_to_word_posterior_lists(lat, self.n_best)
    # NOTE(review): tail of a function whose 'def' line lies above this
    # chunk - it resets the shared decoder and returns the accumulated result.
    d.reset(keep_buffer_data=False)
    return result


def get_audio_callback():
    """Returns a callback - function which handles incoming audio."""
    def frame_in(in_data, frame_count, time_info, status):
        # Feed the raw pcm chunk straight into the module-level decoder 'd'.
        d.frame_in(in_data)
        return in_data, pyaudio.paContinue
    return frame_in


@app.route('/')
def index():
    # Serve the demo front-end page.
    return render_template('index.html')


# Module-level wiring; CLI: <audio_batch_size> <wst_path> <decoder argv...>.
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# Python 2 print statement (this module is Python 2 code).
print >> sys.stderr, 'Python args: %s' % str(sys.argv)
wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
d.setup(argv)
pin = pyaudio.PyAudio()
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())
app.run(host='0.0.0.0', debug=True)
class LiveDemo:
    """Interactive microphone demo around PyOnlineLatgenRecogniser.

    Captures audio through a PyAudio callback, decodes it incrementally,
    and prints the best hypothesis whenever the user marks the end of an
    utterance ('u') or of the whole dialogue ('c').
    """

    def __init__(self, audio_batch_size, wst, dec_args):
        """
        Args:
            audio_batch_size: frames per PyAudio buffer.
            wst: word-symbol table dict mapping word ids to word strings.
            dec_args: argv-style decoder settings for setup().
        """
        self.batch_size = audio_batch_size
        self.wst = wst
        self.args = dec_args
        self.d = PyOnlineLatgenRecogniser()
        self.pin, self.stream = None, None
        self.frames = []
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False

    def setup(self):
        """(Re)initialise the decoder and open the microphone stream."""
        self.d.reset()
        # BUG FIX: previously called self.d.setup(argv), reading a global
        # instead of the decoder arguments passed to the constructor.
        self.d.setup(self.args)
        self.pin = pyaudio.PyAudio()
        self.stream = self.pin.open(format=FORMAT,
                                    channels=CHANNELS,
                                    rate=RATE,
                                    input=True,
                                    frames_per_buffer=self.batch_size,
                                    stream_callback=self.get_audio_callback())
        self.utt_frames, self.new_frames = 0, 0
        self.utt_end, self.dialog_end = False, False
        self.frames = []

    def tear_down(self):
        """Stop audio capture and release PyAudio resources."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
        if self.pin is not None:
            self.pin.terminate()
        # BUG FIX: previously assigned throwaway locals 'p, stream = None,
        # None'; clear the instance attributes instead.
        self.pin, self.stream = None, None
        self.frames = []

    def get_audio_callback(self):
        """Return a PyAudio stream callback feeding pcm into the decoder."""
        def frame_in(in_data, frame_count, time_info, status):
            self.d.frame_in(in_data)
            self.frames.append(in_data)  # keep audio for save_wav()
            return in_data, pyaudio.paContinue
        return frame_in

    def _user_control(self):
        '''Simply stupid sollution how to control state of recogniser.'''
        self.utt_end, self.dialog_end = False, False
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())
            # if is data on input
            while (select.select([sys.stdin], [], [], 1) == ([sys.stdin], [], [])):
                c = sys.stdin.read(1)
                if c == 'u':
                    print('\nMarked end of utterance\n')
                    self.utt_end = True
                elif c == 'c':
                    self.dialog_end = True
                    print('\nMarked end of dialogue\n')
        finally:
            # Always restore the terminal mode, even on exceptions.
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
        print("""Chunks: %d ; Utterance %d ; end %d : press 'u'\nFor terminating press 'c'\n\n""" % (
            len(self.frames), self.utt_frames, self.utt_end))

    def run(self):
        """Main loop: poll keyboard, decode, report on utterance/dialog end."""
        while True:
            time.sleep(0.1)
            self._user_control()
            # Drain everything the decoder can process right now.
            new_frames = self.d.decode(max_frames=10)
            while new_frames > 0:
                self.utt_frames += new_frames
                new_frames = self.d.decode(max_frames=10)
            if self.utt_end or self.dialog_end:
                start = time.time()
                self.d.prune_final()
                prob, lat = self.d.get_lattice()
                # lat.write('live-demo-recorded.fst')
                nbest = lattice_to_nbest(lat, n=10)
                if nbest:
                    best_prob, best_path = nbest[0]
                    # BUG FIX: previously read the global 'wst' instead of
                    # the table stored on the instance.
                    decoded = ' '.join([self.wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print("%s secs, frames: %d, prob: %f, %s " % (
                    str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Dump all captured audio to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()
def get_audio_callback():
    """Returns a callback - function which handles incoming audio."""
    def frame_in(in_data, frame_count, time_info, status):
        # Feed the raw pcm chunk straight into the module-level decoder 'd'.
        d.frame_in(in_data)
        return in_data, pyaudio.paContinue
    return frame_in


@app.route('/')
def index():
    # Serve the demo front-end page.
    return render_template('index.html')


# Module-level wiring; CLI: <audio_batch_size> <wst_path> <decoder argv...>.
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# Python 2 print statement (this module is Python 2 code).
print >> sys.stderr, 'Python args: %s' % str(sys.argv)
wst = wst2dict(wst_path)  # word-symbol table: word id -> word string
d.setup(argv)
pin = pyaudio.PyAudio()
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())
app.run(host='0.0.0.0', debug=True)