Exemplo n.º 1
0
def create_asr():
    """Assemble and return an ``ASR`` object backed by a Kaldi recogniser.

    The recogniser is configured from ``config.kaldi_config`` and the word
    dictionary is loaded from ``config.wst_path``.  The same ``PathToText``
    converter is shared by the n-best and best-path helpers.
    """
    # Imports are kept local to the function, in their original order.
    import config
    from kaldi.utils import lattice_to_nbest, wst2dict
    from kaldi.decoders import PyOnlineLatgenRecogniser
    from asr_utils import lattice_calibration

    decoder = PyOnlineLatgenRecogniser()
    decoder.setup(config.kaldi_config)
    words = wst2dict(config.wst_path)

    text_converter = PathToText(words)
    nbest_converter = ToNBest(
        text_converter,
        lattice_to_nbest,
        lattice_calibration,
    )
    best_path_converter = ToBestPath(text_converter)

    return ASR(decoder, nbest_converter, best_path_converter)
Exemplo n.º 2
0
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode each wav file and write its 10-best hypotheses.

    Args:
        argv: Options passed unchanged to ``PyOnlineLatgenRecogniser.setup``.
        audio_batch_size: Not used by this function; kept so existing
            callers keep working.
        wav_paths: Iterable of ``(wav_name, wav_path)`` pairs.
        file_output: Destination handed to ``write_decoded``.
        wst_path: Path of the word symbol table, read by ``wst2dict`` and
            ``fst.read_symbols_text``.
            NOTE(review): the default ``None`` is passed straight to both
            readers; presumably callers always supply a real path.
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
    for wav_name, wav_path in wav_paths:
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        # Use the word symbol table for both input and output symbols.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)

        # Keep the average as a float: wrapping it in int() would truncate
        # the value that is then formatted with %f.  An utterance with no
        # decoded frames reports nan instead of raising ZeroDivisionError.
        if decoded_frames:
            lik_per_frame = lik / decoded_frames
        else:
            lik_per_frame = float('nan')
        print(
            "Log-likelihood per frame for utterance %s is %f over %d frames" %
            (wav_name, lik_per_frame, decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
def decode_wrap(argv, audio_batch_size, wav_paths,
        file_output, wst_path=None):
    """Decode each wav file and write its 10-best hypotheses.

    Args:
        argv: Options passed unchanged to ``PyOnlineLatgenRecogniser.setup``.
        audio_batch_size: Not used by this function; kept so existing
            callers keep working.
        wav_paths: Iterable of ``(wav_name, wav_path)`` pairs.
        file_output: Destination handed to ``write_decoded``.
        wst_path: Path of the word symbol table, read by ``wst2dict`` and
            ``fst.read_symbols_text``.
            NOTE(review): the default ``None`` is passed straight to both
            readers; presumably callers always supply a real path.
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
    for wav_name, wav_path in wav_paths:
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # Single-argument print(...) produces the same output whether print
        # is the Python 2 statement or the Python 3 function.
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        # Use the word symbol table for both input and output symbols.
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)

        # An utterance with no decoded frames reports nan instead of
        # raising ZeroDivisionError.
        if decoded_frames:
            lik_per_frame = lik / decoded_frames
        else:
            lik_per_frame = float('nan')
        print("Log-likelihood per frame for utterance %s is %f over %d frames" % (
            wav_name, lik_per_frame, decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
Exemplo n.º 4
0
                    decoded = ' '.join([wst[w] for w in best_path])
                else:
                    decoded = 'Empty hypothesis'
                print(
                    "%s secs, frames: %d, prob: %f, %s " %
                    (str(time.time() - start), self.utt_frames, prob, decoded))
                self.utt_frames = 0
                self.d.reset(keep_buffer_data=False)
            if self.dialog_end:
                self.save_wav()
                break

    def save_wav(self):
        """Write the audio buffered in ``self.frames`` to ``live-demo-record.wav``.

        The wave header is filled from the module-level ``CHANNELS`` and
        ``RATE`` and from the sample width that
        ``self.pin.get_sample_size(FORMAT)`` returns.
        """
        wf = wave.open('live-demo-record.wav', 'wb')
        try:
            wf.setnchannels(CHANNELS)
            wf.setframerate(RATE)
            wf.setsampwidth(self.pin.get_sample_size(FORMAT))
            wf.writeframes(b''.join(self.frames))
        finally:
            # Close even if a setter or the write raises, so the underlying
            # file handle is never leaked.
            wf.close()


if __name__ == '__main__':
    # Command line: <audio_batch_size> <wst_path> [decoder options ...]
    audio_batch_size = int(sys.argv[1])
    wst_path = sys.argv[2]
    argv = sys.argv[3:]  # everything else is passed on to LiveDemo
    print('Python args: %s' % str(sys.argv), file=sys.stderr)

    # Load the word symbol table, then build, set up and run the demo.
    wst = wst2dict(wst_path)
    demo = LiveDemo(audio_batch_size, wst, argv)
    demo.setup()
    demo.run()
Exemplo n.º 5
0
def create_dictionary(basedir):
    """Load ``<basedir>/models/words.txt`` into the module-level ``wst``.

    The file is read with ``wst2dict``; the result replaces any previous
    value of the global ``wst``.
    """
    global wst

    words_file = '%s/models/words.txt' % basedir
    wst = wst2dict(words_file)
Exemplo n.º 6
0
    d.reset(keep_buffer_data=False)
    return result


def get_audio_callback():
    """Return a callback function that handles incoming audio.

    The callback hands each received buffer to ``d.frame_in`` and returns
    the same buffer together with ``pyaudio.paContinue``.
    """
    def frame_in(in_data, frame_count, time_info, status):
        # Only the raw buffer is used; the other arguments are ignored.
        d.frame_in(in_data)
        continue_flag = pyaudio.paContinue
        return (in_data, continue_flag)

    return frame_in


@app.route('/')
def index():
    """Render the ``index.html`` template for the ``/`` route."""
    page = render_template('index.html')
    return page


# Command line: <audio_batch_size> <wst_path> [decoder options ...]
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# Written with sys.stderr.write so the line works on both Python 2 and 3.
sys.stderr.write('Python args: %s\n' % str(sys.argv))

wst = wst2dict(wst_path)

# Configure the decoder, then open the input stream; each captured buffer
# is delivered to the function returned by get_audio_callback().
d.setup(argv)
pin = pyaudio.PyAudio()
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())

# NOTE(review): if `app` is a Flask application, debug=True combined with
# host='0.0.0.0' exposes the interactive debugger on every network
# interface. Confirm this is only ever run on a trusted machine.
app.run(host='0.0.0.0', debug=True)