def __init__(self, corpus_name, create_db=False):

        self.corpus_name = corpus_name
        self.ts = {}
        self.tsdir = TSDIR % corpus_name

        if create_db:
            if not os.path.exists(self.tsdir):
                logging.info('creating %s' % self.tsdir)
                misc.mkdirs(self.tsdir)

        for tsfn in os.listdir(self.tsdir):
            print(tsfn)
            if not tsfn.startswith('transcripts') or not tsfn.endswith('.csv'):
                continue

            with codecs.open('%s/%s' % (self.tsdir, tsfn), 'r', 'utf8') as f:

                while True:

                    line = f.readline().rstrip()

                    if not line:
                        break

                    parts = line.split(';')
                    # print repr(parts)

                    if len(parts) != 6:
                        raise Exception("***ERROR in transcripts: %s" % line)

                    cfn = parts[0]
                    dirfn = parts[1]
                    audiofn = parts[2]
                    prompt = parts[3]
                    ts = parts[4]
                    quality = int(parts[5])
                    spk = cfn.split('-')[0]

                    v = {
                        'cfn': cfn,
                        'dirfn': dirfn,
                        'audiofn': audiofn,
                        'prompt': prompt,
                        'ts': ts,
                        'quality': quality,
                        'spk': spk,
                        'corpus_name': self.corpus_name
                    }

                    self.ts[cfn] = v
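
A minimal usage sketch (assumption: this __init__ belongs to the Transcripts loader class that later examples below instantiate; the corpus name is a placeholder):

transcripts = Transcripts('voxforge_de', create_db=True)
for utt_id in sorted(transcripts.ts):
    entry = transcripts.ts[utt_id]
    print('%s [%s] %s' % (utt_id, entry['spk'], entry['prompt']))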
Example No. 2

    def export_gpt2(self, offset=0, debug_limit=0):

        for mn2 in self.all_skills:
            self.consult_skill(mn2)
        self.setup_nlp_model()

        logging.info('load discourses from db...')

        inps = set()
        for dr in self.session.query(model.TrainingData).filter(
                model.TrainingData.lang == self.lang):
            if not dr.inp in inps:
                inps.add(dr.inp)

        user_uri = USER_PREFIX + 'gpt2'
        ctx = self.create_context(user=user_uri, realm='__gpt2__')

        cnt = 0

        misc.mkdirs('zamiaai-gpt2')

        for inp in sorted(inps):

            if cnt < offset:
                cnt += 1
                continue

            try:

                logging.info(u'%07d/%07d QUES : %s' % (cnt, len(inps), inp))

                out, score, action = self.process_input(ctx,
                                                        inp,
                                                        run_trace=False)

                logging.info(u'%07d/%07d RESP: [%6.1f] %s ' %
                             (cnt, len(inps), score, out))

                data = {"info": "", "dlg": [{'q': inp, 'a': out}]}

                datafn = 'zamiaai-gpt2/%07d.json' % cnt
                with codecs.open(datafn, 'w', 'utf8') as dataf:
                    dataf.write(json.dumps(data))

                # logging.info('%s written.' % datafn)

                cnt += 1
            except:
                logging.exception('EXCEPTION')
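
Each exported file holds a single question/answer turn; a sketch of what one generated zamiaai-gpt2/0000042.json might contain (the q/a strings here are made up):

{"info": "", "dlg": [{"q": "how are you", "a": "i am fine, thank you"}]}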
Example No. 3

def create_training_data_for_language_model(transcript_objs, utt_dict, data_dir):
    transcripts = {}
    for transcript_obj in transcript_objs:
        transcripts.update(transcript_obj.ts)
    misc.mkdirs('%s/local/lm' % data_dir)
    fn = '%s/local/lm/train_nounk.txt' % data_dir
    with open(fn, 'w') as f:

        for utt_id in sorted(transcripts):
            ts = transcripts[utt_id]
            f.write((u'%s\n' % ts['ts']).encode('utf8'))
    logging.info("%s written." % fn)
    fn = '%s/local/lm/wordlist.txt' % data_dir
    with open(fn, 'w') as f:

        for token in sorted(utt_dict):
            f.write((u'%s\n' % token).encode('utf8'))
    logging.info("%s written." % fn)
Example No. 4

def export_kaldi_data(wav16_dir, audio_corpora, destdirfn, tsdict):
    logging.info("Exporting kaldi data to %s..." % destdirfn)

    misc.mkdirs(destdirfn)

    with open(destdirfn+'wav.scp','w') as wavscpf,  \
         open(destdirfn+'utt2spk','w') as utt2spkf, \
         open(destdirfn+'text','w') as textf:

        for utt_id in sorted(tsdict):
            ts = tsdict[utt_id]

            textf.write((u'%s %s\n' % (utt_id, ts['ts'])).encode('utf8'))

            wavscpf.write('%s %s/%s/%s.wav\n' %
                          (utt_id, wav16_dir, ts['corpus_name'], utt_id))

            utt2spkf.write('%s %s\n' % (utt_id, ts['spk']))
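
The three files written here are the standard Kaldi data-directory files: text (utterance id and transcript), wav.scp (utterance id and path to the 16 kHz wav) and utt2spk (utterance id and speaker). Since the file names are built by plain string concatenation (destdirfn + 'wav.scp'), destdirfn is expected to end in a slash; a hedged call with placeholder arguments might look like:

export_kaldi_data('/opt/speech/wav16', ['voxforge_de'], 'data/dst/work/data/train/', ts_train)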
Example No. 5

def do_save_audio ():

    global prompt, vf_login, rec_dir, recording, stdscr

    ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
    audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds)
    logging.debug('audiodirfn: %s' % audiodirfn)
    misc.mkdirs(audiodirfn)

    cnt = 0
    while True:
        cnt += 1
        audiofn = '%s/de5-%03d.wav' % (audiodirfn, cnt)
        if not os.path.isfile(audiofn):
            break

    logging.debug('audiofn: %s' % audiofn)

    # create wav file 

    wf = wave.open(audiofn, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLE_RATE)

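    # pack the buffered samples as 16-bit signed integers ('h') to form the
    # payload of the mono, SAMPLE_RATE Hz WAV file opened above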
    packed_audio = struct.pack('%sh' % len(recording), *recording)
    wf.writeframes(packed_audio)
    wf.close()  

    # append etc/prompts-original file

    etcdirfn = '%s/%s-%s-rec/etc' % (rec_dir, vf_login, ds)
    logging.debug('etcdirfn: %s' % etcdirfn)
    misc.mkdirs(etcdirfn)

    promptsfn = '%s/prompts-original' % etcdirfn
    with codecs.open(promptsfn, 'a', 'utf8') as promptsf:
        promptsf.write('de5-%03d %s\n' % (cnt, prompt))

    misc.message_popup(stdscr, 'WAVE file written', audiofn)

    stdscr.getch()
Example No. 6

def copy_scripts_and_config_files(work_dir, kaldi_root):
    misc.copy_file('data/src/speech/kaldi-run-lm.sh', '%s/run-lm.sh' % work_dir)
    # misc.copy_file ('data/src/speech/kaldi-run-am.sh', '%s/run-am.sh' % work_dir)
    # misc.copy_file ('data/src/speech/kaldi-run-nnet3.sh', '%s/run-nnet3.sh' % work_dir)
    misc.copy_file('data/src/speech/kaldi-run-chain.sh',
                   '%s/run-chain.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-wrapper.sh',
    #                '%s/run-chain-wrapper.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-cfg.sh',
    #                '%s/run-chain-cfg.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-cpu.sh',
    #                '%s/run-chain-cpu.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-cpu-wrapper.sh',
    #                '%s/run-chain-cpu-wrapper.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-gpu.sh',
    #                '%s/run-chain-gpu.sh' % work_dir)
    # misc.copy_file('data/src/speech/kaldi-run-chain-gpu-wrapper.sh',
    #                '%s/run-chain-gpu-wrapper.sh' % work_dir)
    misc.copy_file('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % work_dir)
    misc.render_template('data/src/speech/kaldi-path.sh.template',
                         '%s/path.sh' % work_dir, kaldi_root=kaldi_root)
    misc.mkdirs('%s/conf' % work_dir)
    misc.copy_file('data/src/speech/kaldi-mfcc.conf',
                   '%s/conf/mfcc.conf' % work_dir)
    misc.copy_file('data/src/speech/kaldi-mfcc-hires.conf',
                   '%s/conf/mfcc_hires.conf' % work_dir)
    misc.copy_file('data/src/speech/kaldi-online-cmvn.conf',
                   '%s/conf/online_cmvn.conf' % work_dir)
    misc.mkdirs('%s/local' % work_dir)
    misc.copy_file('data/src/speech/kaldi-score.sh',
                   '%s/local/score.sh' % work_dir)
    misc.mkdirs('%s/local/nnet3' % work_dir)
    misc.copy_file('data/src/speech/kaldi-run-ivector-common.sh',
                   '%s/local/nnet3/run_ivector_common.sh' % work_dir)
Example No. 7

def export_kaldi_data (destdirfn, tsdict):

    global wav16_dir

    logging.info ( "Exporting to %s..." % destdirfn)

    misc.mkdirs(destdirfn)

    with open(destdirfn+'wav.scp','w') as wavscpf,  \
         open(destdirfn+'utt2spk','w') as utt2spkf, \
         open(destdirfn+'text','w') as textf:

        for utt_id in sorted(tsdict):
            ts = tsdict[utt_id]

            textf.write((u'%s %s\n' % (utt_id, ts['ts'])).encode('utf8'))

            wavscpf.write('%s %s/%s.wav\n' % (utt_id, wav16_dir, utt_id))

            utt2spkf.write('%s %s\n' % (utt_id, ts['spk']))

    misc.copy_file ('data/src/speech/%s/spk2gender' % options.lang, '%s/spk2gender' % destdirfn)
Example No. 8

def create_basic_work_dir_structure(data_dir, wav16_dir, mfcc_dir, work_dir,
                                    language_model_dir, kaldi_root):
    # FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
    misc.mkdirs('%s/local/dict' % data_dir)
    misc.mkdirs(wav16_dir)
    misc.mkdirs(mfcc_dir)
    misc.symlink(language_model_dir, '%s/lm' % work_dir)
    misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
    misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)
Example No. 9

            continue

        speaker = options.speaker1 if resp == '1' else options.speaker2

        # does a directory for recordings of this speaker already exist?

        speakerdirfn = None
        for fn in os.listdir(options.outdir):
            if fn.startswith(speaker):
                speakerdirfn = '%s/%s' % (options.outdir, fn)
                break
        if not speakerdirfn:
            ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
            speakerdirfn = '%s/%s-%s-rec' % (options.outdir, speaker, ds)

        misc.mkdirs('%s/wav' % speakerdirfn)
        misc.mkdirs('%s/etc' % speakerdirfn)

        destfn = '%s/wav/%s' % (speakerdirfn, os.path.basename(segmentfn))
        os.rename(segmentfn, destfn)
        print "moved %s to %s" % (segmentfn, destfn)

        promptsfn = '%s/etc/prompts-original' % speakerdirfn
        with codecs.open(promptsfn, 'a', 'utf8') as promptsf:
            wavbn = os.path.basename(segmentfn)
            wavbn = os.path.splitext(wavbn)[0]
            promptsf.write(u'%s %s\n' % (wavbn, prompt))
        print "%s written." % promptsfn

        next_segment()
        play_wav()
Example No. 10

    else:
        logging.basicConfig(level=logging.INFO)

    lang = options.lang
    use_prompts = options.use_prompts

    if len(args) != 1:
        logging.error("Exactly one corpus (text or speech) must be provided.")

        parser.print_help()

        sys.exit(1)

    corpus = args[0]

    misc.mkdirs(TEXT_CORPORA_DIR)

    out_file = '%s/%s.txt' % (TEXT_CORPORA_DIR, corpus)

    with codecs.open(out_file, "w", "utf-8") as outf:
        # I haven't figured out how to refactor the processing algorithms of the
        # parole corpus to implement a generator.
        if corpus == "parole_de":
            corpus_path = config.get("speech", corpus)
            proc_parole_de(corpus_path, load_punkt_tokenizer, outf)
        elif corpus in TEXT_CORPORA:
            corpus_path = config.get("speech", corpus)
            for sentence in TEXT_CORPORA[corpus](corpus_path):
                outf.write(sentence + "\n")
        elif corpus in SPEECH_CORPORA:
            for sentence in SPEECH_CORPORA[corpus]():
Example No. 11

#
# audio, prompts
#

for subset in os.listdir(SRCDIR):

    if not subset in SUBSETS:
        continue

    for speaker in os.listdir(SRCDIR + '/' + subset):
        for book_id in os.listdir(SRCDIR + '/' + subset + '/' + speaker):

            folder = 'librispeech%s-%s' % (speaker, book_id)
            dstdir = '%s/%s' % (DESTDIR, folder)

            misc.mkdirs('%s/flac' % dstdir)
            misc.mkdirs('%s/etc' % dstdir)

            promptsfn = '%s/etc/prompts-original' % dstdir
            transfn = '%s/%s/%s/%s/%s-%s.trans.txt' % (
                SRCDIR, subset, speaker, book_id, speaker, book_id)

            with codecs.open(promptsfn, 'w', 'utf8') as promptsf:
                with codecs.open(transfn, 'r', 'utf8') as transf:
                    for line in transf:
                        parts = line.split()
                        promptsf.write(line)

                        flac_src = '%s/%s/%s/%s/%s.flac' % (
                            SRCDIR, subset, speaker, book_id, parts[0])
                        flac_dst = '%s/flac/%s.flac' % (dstdir, parts[0])
Example No. 12

dict_name = options.dict_name
workdir = 'data/dst/dict-models/%s/sequitur' % dict_name

#
# load lexicon
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# export
#

misc.mkdirs(workdir)

with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex'  % workdir, 'w', 'utf8') as testf, \
     codecs.open('%s/all.lex'  % workdir, 'w', 'utf8') as allf :

    cnt = 0

    for word in lex:

        ipa = lex[word]['ipa']

        xs = ipa2xsampa(word, ipa, spaces=True, stress_to_vowels=False)

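        # every 10th lexicon entry goes to the held-out test set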
        if cnt % 10 == 0:
            testf.write(u'%s %s\n' % (word, xs))
Example No. 13

def export_sphinx_case(work_dir, sphinxtrain_cfg_fn):

    #
    # language model
    #

    misc.mkdirs('%s' % work_dir)

    fn = '%s/prompts.sent' % work_dir

    with codecs.open(fn, 'w', 'utf8') as outf:

        for cfn in ts_all:

            transcript = transcripts[cfn]['ts']

            outf.write('%s\n' % transcript)

    logging.info("%s written." % fn)

    fn = '%s/wlist.txt' % work_dir

    with codecs.open(fn, 'w', 'utf8') as outf:

        for word in lex:

            if ENABLE_NOISE_FILLER:
                if word == NOISE_WORD:
                    logging.debug('skipping noise word')
                    continue

            outf.write('%s\n' % word)

    logging.info("%s written." % fn)

    #
    # create work_dir structure
    #

    mfcc_dir = "%s/mfcc" % work_dir

    misc.mkdirs('%s/logs' % work_dir)
    misc.mkdirs('%s/etc' % work_dir)
    misc.mkdirs('%s' % mfcc_dir)

    # generate sphinx_train.cfg, featdir in there

    # inf = codecs.open ('data/src/speech/sphinx_train.cfg', 'r', 'utf8')
    # outf = codecs.open ('%s/etc/sphinx_train.cfg' % work_dir, 'w', 'utf8')
    # for line in inf:
    #     s = line.replace('%FEATDIR%', mfcc_dir).replace('%WORKDIR%', work_dir)
    #     outf.write (s)
    # inf.close()
    # outf.close()

    misc.copy_file(sphinxtrain_cfg_fn, '%s/etc/sphinx_train.cfg' % work_dir)
    if ENABLE_NOISE_FILLER:
        misc.copy_file('data/src/speech/sphinx-voxforge-noise.filler',
                       '%s/etc/voxforge.filler' % work_dir)
    else:
        misc.copy_file('data/src/speech/sphinx-voxforge.filler',
                       '%s/etc/voxforge.filler' % work_dir)
    misc.copy_file('data/src/speech/sphinx-feat.params',
                   '%s/etc/feat.params' % work_dir)

    #
    # prompts
    #

    train_fifn = '%s/etc/voxforge_train.fileids' % work_dir
    train_tsfn = '%s/etc/voxforge_train.transcription' % work_dir
    test_fifn = '%s/etc/voxforge_test.fileids' % work_dir
    test_tsfn = '%s/etc/voxforge_test.transcription' % work_dir
    runfeatfn = '%s/run-feat.sh' % work_dir

    lex_covered = set()

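    # sphinx_fe command template: one backgrounded feature-extraction job is
    # emitted per utterance, with a 'wait' written after every NJOBS jobs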
    SPHINXFE = "sphinx_fe -i '%s' -part 1 -npart 1 -ei wav -o '%s' -eo mfc -nist no -raw no -mswav yes -samprate 16000 -lowerf 130 -upperf 6800 -nfilt 25 -transform dct -lifter 22 >>logs/mfcc%02d.log 2>&1 &\n"
    with codecs.open(runfeatfn, 'w', 'utf8') as runfeatf:

        runfeatf.write('#!/bin/bash\n\n')

        cnt = 0
        for cfn in ts_all:

            w16filename = "%s/%s/%s.wav" % (wav16_dir, cfn2corpus[cfn], cfn)
            mfcfilename = "mfcc/%s.mfc" % cfn
            runfeatf.write(SPHINXFE % (w16filename, mfcfilename, cnt))
            cnt = (cnt + 1) % NJOBS

            if cnt == 0:
                runfeatf.write('wait\n')

    logging.info("%s written." % runfeatfn)

    with codecs.open (train_fifn, 'w', 'utf8') as train_fif, \
         codecs.open (train_tsfn, 'w', 'utf8') as train_tsf, \
         codecs.open (test_fifn,  'w', 'utf8') as test_fif,  \
         codecs.open (test_tsfn,  'w', 'utf8') as test_tsf:

        for cfn in ts_train:
            train_fif.write('%s\n' % cfn)
            tokens = tokenize(ts_train[cfn]['ts'],
                              lang=options.lang,
                              keep_punctuation=False)
            ts = u' '.join(tokens)
            train_tsf.write(u'<s> %s </s> (%s)\n' % (ts, cfn))

            for token in tokens:
                if not token in lex:
                    logging.error('word %s not covered by dict!' % token)
                    sys.exit(1)
                lex_covered.add(token)

        for cfn in ts_test:
            test_fif.write('%s\n' % cfn)
            tokens = tokenize(ts_test[cfn]['ts'],
                              lang=options.lang,
                              keep_punctuation=False)
            ts = u' '.join(tokens)
            test_tsf.write(u'<s> %s </s> (%s)\n' % (ts, cfn))

            for token in tokens:
                if not token in lex:
                    logging.error('word %s not covered by dict!' % token)
                    sys.exit(1)
                lex_covered.add(token)

    logging.info("%s written." % train_tsfn)
    logging.info("%s written." % train_fifn)
    logging.info("%s written." % test_tsfn)
    logging.info("%s written." % test_fifn)

    # generate dict

    phoneset = set()

    pdfn = '%s/etc/voxforge.dic' % work_dir
    with codecs.open(pdfn, 'w', 'utf8') as pdf:

        for word in lex:

            if ENABLE_NOISE_FILLER:
                if word == NOISE_WORD:
                    logging.debug('skipping noise word')
                    continue

            if not word in lex_covered:
                logging.debug(
                    'skipping word %s as it is not covered by transcripts' %
                    word)
                continue

            ipa = lex[word]['ipa']

            xs = ipa2xsampa(word, ipa)
            xa = xsampa2xarpabet(word, xs)

            pdf.write(u'%s %s\n' % (word, xa))

            phones = xa.split(' ')
            for phone in phones:

                if len(phone.strip()) == 0:
                    logging.error(
                        u"***ERROR: empty phone detected in lex entry %s %s" %
                        (word, ipa))

                phoneset.add(phone)

    logging.info("%s written." % pdfn)

    logging.info("Got %d phones." % len(phoneset))

    phfn = '%s/etc/voxforge.phone' % work_dir
    with codecs.open(phfn, 'w', 'utf8') as phf:

        for phone in phoneset:
            phf.write(u'%s\n' % phone)

        phf.write(u'SIL\n')
        if ENABLE_NOISE_FILLER:
            phf.write(u'NSPC\n')

    logging.info("%s written." % phfn)

    misc.render_template('data/src/speech/sphinx-run.sh.template',
                         '%s/sphinx-run.sh' % work_dir,
                         lm_name=lm_name)
Example No. 14

def on_message(client, userdata, message):

    # global kernal, lang
    global msg_queue, msg_cond, ignore_audio_before
    global wfs, vf_login, rec_dir, audiofns, pstr, hstr, astr, audio_cnt
    global do_listen, do_rec, do_asr, att_force, listening, attention
    global tts_lock, tts

    # logging.debug( "message received %s" % str(message.payload.decode("utf-8")))
    # logging.debug( "message topic=%s" % message.topic)
    # logging.debug( "message qos=%s" % message.qos)
    # logging.debug( "message retain flag=%s" % message.retain)

    msg_cond.acquire()
    try:

        if message.topic == TOPIC_INPUT_AUDIO:

            data = json.loads(message.payload)
            data['topic'] = message.topic
            audio = data['pcm']
            loc = data['loc']
            do_finalize = data['final']
            ts = dateutil.parser.parse(data['ts'])

            # ignore old audio recordings that may have lingered in the message queue

            age = (datetime.datetime.now() - ts).total_seconds()
            if age > MAX_AUDIO_AGE:
                # logging.debug ("   ignoring audio that is too old: %fs > %fs" % (age, MAX_AUDIO_AGE))
                return

            if ts < ignore_audio_before:
                # logging.debug ("   ignoring audio that is ourselves talking: %s < %s" % (unicode(ts), unicode(ignore_audio_before)))
                return

            audio_cnt += 1
            pstr = '.' * (audio_cnt / 10 + 1)

            if do_rec:

                # store recording in WAV format

                if not loc in wfs:
                    wfs[loc] = None

                if not wfs[loc]:

                    ds = datetime.date.strftime(datetime.date.today(),
                                                '%Y%m%d')
                    audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds)
                    logging.debug('audiodirfn: %s' % audiodirfn)
                    misc.mkdirs(audiodirfn)

                    cnt = 0
                    while True:
                        cnt += 1
                        audiofns[loc] = '%s/de5-%03d.wav' % (audiodirfn, cnt)
                        if not os.path.isfile(audiofns[loc]):
                            break

                    logging.debug('audiofn: %s' % audiofns[loc])

                    # create wav file

                    wfs[loc] = wave.open(audiofns[loc], 'wb')
                    wfs[loc].setnchannels(1)
                    wfs[loc].setsampwidth(2)
                    wfs[loc].setframerate(SAMPLE_RATE)

                packed_audio = struct.pack('%sh' % len(audio), *audio)
                wfs[loc].writeframes(packed_audio)

                if do_finalize:

                    afn_parts = audiofns[loc].split('/')

                    pstr = afn_parts[len(afn_parts) - 1]
                    logging.info('audiofn %s written.' % audiofns[loc])

                    wfs[loc].close()
                    wfs[loc] = None

            else:
                audiofns[loc] = ''
                if do_finalize:
                    pstr = '***'

            if do_finalize:
                audio_cnt = 0

            if do_asr:

                msg_queue.append(data)
                msg_cond.notify_all()

            else:
                if do_rec:
                    attention = 30

            publish_state(client)

        elif message.topic == TOPIC_INPUT_TEXT:

            data = json.loads(message.payload)
            data['topic'] = message.topic

            msg_queue.append(data)
            msg_cond.notify_all()
            # print data

        elif message.topic == TOPIC_RESPONSE:

            msg = json.loads(message.payload)

            if msg['utt']:

                tts_lock.acquire()
                try:
                    logging.debug('tts.say...')
                    tts.say(msg['utt'])
                    logging.debug('tts.say finished.')

                except:
                    logging.error('TTS EXCEPTION CAUGHT %s' %
                                  traceback.format_exc())
                finally:
                    tts_lock.release()

                ignore_audio_before = datetime.datetime.now(
                ) + datetime.timedelta(seconds=AUDIO_EXTRA_DELAY)

            listening = True
            publish_state(client)

        elif message.topic == TOPIC_CONFIG:

            logging.debug("message received %s" %
                          str(message.payload.decode("utf-8")))
            logging.debug("message topic=%s" % message.topic)
            logging.debug("message qos=%s" % message.qos)
            logging.debug("message retain flag=%s" % message.retain)

            data = json.loads(message.payload)

            do_listen = data['listen']
            do_rec = data['record']
            do_asr = data['asr']
            att_force2 = data['att']
            if att_force2:
                attention = 30
                att_force = True
            elif att_force:
                attention = 2
                att_force = False

            publish_state(client)

    except:
        logging.error('EXCEPTION CAUGHT %s' % traceback.format_exc())

    finally:
        msg_cond.release()
Example No. 15

lex = Lexicon(file_name=dict_name)
logging.info("loading lexicon...done.")

#
# cleanup leftovers from previous runs
#

cmd = 'rm -rf %s' % dst_dir
logging.info(cmd)
os.system(cmd)

#
# dictionary export
#

misc.mkdirs('%s/data/local/dict' % dst_dir)

dictfn2 = '%s/data/local/dict/lexicon.txt' % dst_dir

logging.info("Exporting dictionary...")

ps = {}

with open(dictfn2, 'w') as dictf:

    dictf.write('!SIL SIL\n')

    for token in sorted(lex):
        multi = lex.get_multi(token)
        for form in multi:
            ipa = multi[form]['ipa']
            continue

Example No. 16

        word  = parts[0]
        ipa   = parts[1]
        token = word.replace(u"·", u"").lower()

        wiktionary[token] = (word, ipa)

print "loading wiktionary... done. %d entries." % len(wiktionary)

#
# export training data for sequitur
#

os.system("rm -rf %s" % WORKDIR)
misc.mkdirs(WORKDIR)

num_missing = 0
num_found   = 0

with codecs.open('%s/train.lex' % WORKDIR, 'w', 'utf8') as trainf, \
     codecs.open('%s/test.lex'  % WORKDIR, 'w', 'utf8') as testf, \
     codecs.open('%s/all.lex'   % WORKDIR, 'w', 'utf8') as allf :

    cnt = 0

    for token in lex:
        if not token in wiktionary:
            # print u"Missing in wiktionary: %s" % token
            num_missing += 1
        else:
Example No. 17

        logging.info("conv_action: %s" % repr(action))

    if ai_utt:
        tts.say(ai_utt)

    print

    #
    # save audio recording, if requested
    #

    if options.record_audio:

        ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
        audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds)
        misc.mkdirs(audiodirfn)

        cnt = 0
        while True:
            cnt += 1
            audiofn = '%s/de5-%03d.wav' % (audiodirfn, cnt)
            if not os.path.isfile(audiofn):
                break

        # create wav file

        wf = wave.open(audiofn, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(SAMPLE_RATE)
Example No. 18

    def train(self, num_steps, incremental):

        # load discourses from db
        
        logging.info('load discourses from db...')

        self.training_data = []
        tds      = set()
        for td in self.session.query(model.TrainingData).filter(model.TrainingData.lang==LANG).filter(model.TrainingData.module!='bots'):

            if td.inp in tds:
                continue
            tds.add(td.inp)

            inp = tokenize(td.inp, lang=LANG)
            if len(inp) > INPUT_MAX_LEN:
                inp = inp[:INPUT_MAX_LEN]

            self.training_data.append((inp, td.module))

            if DEBUG_LIMIT and len(tds)>DEBUG_LIMIT:
                break

        shuffle (self.training_data)

        #
        # set up model dir
        #

        if not incremental:
            try:
                shutil.rmtree(MODEL_DIR)
            except:
                pass

            misc.mkdirs(MODEL_DIR)

        #
        # load or create input/output dicts
        #

        if incremental:
            logging.info("loading input and output dicts...")
            self.load_dicts()

        else:
            logging.info("computing input and output dicts...")

            self.compute_dicts()
            self.save_dicts()

        #
        # compute datasets
        #

        logging.info("computing datasets...")

        train_x = []
        train_y = []

        cnt = 0
        for inp, mn in self.training_data:

            x = self.compute_x(inp)
            y = self.compute_y(mn)

            train_x.append(x)
            train_y.append(y)

            cnt += 1

        self.train_x = np.array(train_x, np.int32)
        self.train_y = keras.utils.to_categorical(train_y, len(self.output_dict))

        logging.info("computing datasets done. train:x=%s,y=%s" % (self.train_x.shape, self.train_y.shape))

        #
        # define the keras model
        #

        self._setup_model()

        #
        # fit training data
        #

        best_loss  = 100.0
        best_epoch = 0

        for epoch in range(EPOCHS):

            h = self.keras_model.fit(self.train_x, self.train_y, 
                                     epochs=1, 
                                     validation_split=VALIDATION_SPLIT, 
                                     batch_size=BATCH_SIZE)

            cur_loss = h.history['val_loss'][0]

            if cur_loss < best_loss:

                best_loss  = cur_loss
                best_epoch = epoch

                logging.info("%3d/%3d *** BEST LOSS SO FAR IN THIS TUN: %f FROM THIS EPOCH" % (epoch+1, EPOCHS, best_loss))

                # save the result

                self.keras_model.save_weights(self.keras_weights_fn, overwrite=True)
                logging.info ('%s written.' % self.keras_weights_fn)
            else:
                logging.info("%3d/%3d --- BEST LOSS SO FAR IN THIS TUN: %f FROM EPOCH %d" % (epoch+1, EPOCHS, best_loss, best_epoch))
Example No. 19

work_dir = WORKDIR % options.lang

logging.info ('work_dir: %s' % work_dir)

logging.info ("loading transcripts...")
transcripts = Transcripts(lang=options.lang)
logging.info ("loading transcripts... done.")

#
# merge sentences
#

logging.info ('merging sentence sources...')

mkdirs('%s' % work_dir)

num_sentences = 0

train_fn = '%s/train_all.txt' % work_dir

with codecs.open (train_fn, 'w', 'utf8') as dstf:

    logging.info ('adding transcripts...')
    for cfn in transcripts:
        ts = transcripts[cfn]['ts']
        if len(ts)<2:
            continue

        dstf.write(u'%s\n' % ts)
Example No. 20

logging.info ( "loading lexicon...")
lex = Lexicon(lang=options.lang)
logging.info ( "loading lexicon...done.")

logging.info ( "loading transcripts...")
transcripts = Transcripts(lang=options.lang)
ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all)
logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test)))

#
# create work_dir structure
#


misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)

misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# kaldi data part
#

def export_kaldi_data (destdirfn, tsdict):

    global wav16_dir
Example No. 21

else:
    logging.basicConfig(level=logging.INFO)

#
# config
#

config = misc.load_config ('.speechrc')
speech_arc     = config.get("speech", "speech_arc")
speech_corpora = config.get("speech", "speech_corpora")

#
# convert mp3 to wav, in speaker directories
#

misc.mkdirs('%s/cv_corpus_v3' % (speech_corpora,))

cnt = 0
spk_ids = set()
with open('tmp/run_parallel.sh', 'w') as scriptf, \
        open('%s/cv_corpus_v3/utt_test.txt' % speech_corpora, 'w') as utt_testf:
    files = [
        ('train.tsv', False),
        ('dev.tsv', False),
        ('test.tsv', True),
        ('validated.tsv', False),
        # ('other.tsv', False),
        # ('invalidated.tsv', False),
    ]
    for (tsvfn, is_test) in files:
        with codecs.open('%s/cv_corpus_v3/%s' % (speech_arc, tsvfn), 'r', 'utf8') as tsvf:
Example No. 22

    def train(self, num_epochs, incremental):

        # load discourses from db, resolve non-unique inputs (implicit or of responses)
        
        logging.info('load discourses from db...')

        self.drs = {} 
        self.training_data = []
        for dr in self.session.query(model.TrainingData).filter(model.TrainingData.lang==self.lang):

            self.drs[dr.inp] = dr.skill
            self.training_data.append((tokenize(dr.inp, lang=self.lang), dr.skill))

            if DEBUG_LIMIT>0 and len(self.drs)>=DEBUG_LIMIT:
                logging.warn('  stopped loading discourses because DEBUG_LIMIT of %d was reached.' % DEBUG_LIMIT)
                break

        shuffle(self.training_data)
 
        #
        # set up model dir
        #

        if not incremental:
            mkdirs(self.model_dir)

        #
        # load word embeddings
        #

        self._load_word_embeddings()

        #
        # load or create decoder dict
        #

        if incremental:
            logging.info("loading skills dict...")
            self._load_skills_dict()

        else:
            logging.info("computing skills dict...")
            self._compute_skills_dict()
            self._save_skills_dict()

        #
        # compute datasets
        #

        logging.info("computing datasets...")

        num_decoder_tokens = len (self.skills_dict)

        encoder_input_data  = np.zeros( (len(self.training_data), self.max_inp_len,  self.embed_dim),
                                        dtype='float32')
        decoder_target_data = np.zeros( (len(self.training_data), len(self.skills_dict)),
                                        dtype='float32')

        for i, (inp, skill) in enumerate(self.training_data):

            for j, token in enumerate(inp):
                if unicode(token) in self.embedding_dict:
                    encoder_input_data[i, j] = self.embedding_dict[unicode(token)]

            skill_idx = self.skills_dict[skill]

            decoder_target_data[i, skill_idx] = 1.

            # logging.debug ("%-10s %2d %s" % (skill, skill_idx, repr(inp)))

            # import pdb; pdb.set_trace()

        logging.info("computing datasets done. encoder_input_data.shape=%s, decoder_target_data.shape=%s" % (repr(encoder_input_data.shape), repr(decoder_target_data.shape)))

        #
        # LSTM RNN classifier model setup and training starts here
        #

        self._create_keras_model()

        self.keras_model_train.fit([encoder_input_data], decoder_target_data,
                                   batch_size=self.batch_size,
                                   epochs=num_epochs,
                                   validation_split=0.2)

        self.keras_model_train.save_weights(self.weights_fn)

        logging.info("weights written to %s ." % self.weights_fn)
Example No. 23

                        srcdir, localedir, gender, speaker, book)
                    if not os.path.exists(metafn):
                        continue

                    with codecs.open(metafn, 'r', 'utf8') as metaf:
                        meta = json.loads(metaf.read())

                    logging.debug(
                        'localedir: %s, gender: %6s, speaker: %16s, book: %s' %
                        (localedir, gender, speaker, book))

                    folder = 'mailabs%s-%s' % (speaker.replace(
                        '_', '').replace('-', ''), book.replace('_', '-'))
                    dstdir = '%s/%s' % (destdir, folder)

                    misc.mkdirs('%s/wav' % dstdir)
                    misc.mkdirs('%s/etc' % dstdir)

                    promptsfn = '%s/etc/prompts-original' % dstdir
                    logging.debug('dstdir: %s, promptsfn: %s' %
                                  (dstdir, promptsfn))

                    with codecs.open(promptsfn, 'w', 'utf8') as promptsf:
                        for wavfn in meta:

                            ts_orig = meta[wavfn]['clean']
                            uttid = os.path.splitext(wavfn.replace('_',
                                                                   '-'))[0]

                            if uttid in all_utts:
                                logging.error('utterance id not unique: %s' % uttid)
Example No. 24

#

config = misc.load_config('.speechrc')

w2l_env_activate = config.get("speech", "w2l_env_activate")
w2l_decoder = config.get("speech", "w2l_decoder")
wav16_dir = config.get("speech", "wav16")

#
# create basic work dir structure
#

cmd = 'rm -rf %s' % WORK_DIR
logging.debug(cmd)
os.system(cmd)
misc.mkdirs('%s/test' % data_dir)

#
# scripts
#

misc.render_template('data/src/speech/w2l_run_auto_review.sh.template',
                     '%s/run_auto_review.sh' % WORK_DIR,
                     w2l_env_activate=w2l_env_activate,
                     w2l_decoder=w2l_decoder,
                     cuda_device=CUDA_DEVICE,
                     w2l_tokensdir='../../data/models/%s' % model_name,
                     w2l_tokens='tokens.txt',
                     w2l_lexicon='../../data/models/%s/lexicon.txt' %
                     model_name,
                     w2l_am='../../data/models/%s/model.bin' % model_name,
Example No. 25

#
# load transcripts
#

logging.info ( "loading transcripts...")
transcripts = Transcripts(corpus_name=options.lang)
logging.info ( "loading transcripts...done. %d transcripts." % len(transcripts))
logging.info ("splitting transcripts...")
ts_all, ts_train, ts_test = transcripts.split()
logging.info ("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test)))

#
# create work_dir 
#

misc.mkdirs('%s' % work_dir)

# export csv files

csv_train_fn = '%s/train.csv' % work_dir
csv_dev_fn   = '%s/dev.csv'   % work_dir
csv_test_fn  = '%s/test.csv'  % work_dir

alphabet = set()
vocabulary = []

def export_ds(ds, csv_fn):

    global alphabet

    cnt = 0
Example No. 26

def kaldi_adapt_lm(kaldi_root, src_model_dir, lm_fn, work_dir, dst_model_name):

    steps_path = '%s/egs/wsj/s5/steps' % kaldi_root
    if not os.path.exists (steps_path):
        raise Exception ('%s does not exist - is kaldi really installed in %s ?' % (steps_path, kaldi_root))

    tmpl_dir = os.path.dirname(os.path.abspath(__file__)) + '/templates'

    #
    # copy dictionary and phoneme sets from original model
    #

    logging.info("copying dictionary and phoneme sets from original model...")

    misc.mkdirs('%s/data/local/dict' % work_dir)
    misc.copy_file ('%s/data/local/dict/lexicon.txt' % src_model_dir,           '%s/data/local/dict/lexicon.txt' % work_dir)
    misc.copy_file ('%s/data/local/dict/nonsilence_phones.txt' % src_model_dir, '%s/data/local/dict/nonsilence_phones.txt' % work_dir)
    misc.copy_file ('%s/data/local/dict/silence_phones.txt' % src_model_dir,    '%s/data/local/dict/silence_phones.txt' % work_dir)
    misc.copy_file ('%s/data/local/dict/optional_silence.txt' % src_model_dir,  '%s/data/local/dict/optional_silence.txt' % work_dir)
    misc.copy_file ('%s/data/local/dict/extra_questions.txt' % src_model_dir,   '%s/data/local/dict/extra_questions.txt' % work_dir)

    #
    # language model 
    #

    misc.copy_file (lm_fn, '%s/lm.arpa' % work_dir)

    #
    # create skeleton dst model
    #

    logging.info("creating skeleton destination model...")

    misc.mkdirs ('%s/exp/adapt'  % work_dir)

    misc.copy_file ('%s/model/final.mdl' % src_model_dir, '%s/exp/adapt/final.mdl' % work_dir)
    misc.copy_file ('%s/model/cmvn_opts' % src_model_dir, '%s/exp/adapt/cmvn_opts' % work_dir)
    misc.copy_file ('%s/model/tree'      % src_model_dir, '%s/exp/adapt/tree'      % work_dir)

    for optional_file in [ 'final.mat', 'splice_opts', 'final.occs', 'full.mat' ] :
        if os.path.exists('%s/model/%s' % (src_model_dir, optional_file)):
            misc.copy_file ('%s/model/%s' % (src_model_dir, optional_file), '%s/exp/adapt/%s' % (work_dir, optional_file))

    if os.path.exists('%s/extractor' % src_model_dir):

        misc.mkdirs ('%s/exp/extractor' % work_dir)

        misc.copy_file ('%s/extractor/final.mat'         % src_model_dir, '%s/exp/extractor/final.mat'         % work_dir)
        misc.copy_file ('%s/extractor/global_cmvn.stats' % src_model_dir, '%s/exp/extractor/global_cmvn.stats' % work_dir)
        misc.copy_file ('%s/extractor/final.dubm'        % src_model_dir, '%s/exp/extractor/final.dubm'        % work_dir)
        misc.copy_file ('%s/extractor/final.ie'          % src_model_dir, '%s/exp/extractor/final.ie'          % work_dir)
        misc.copy_file ('%s/extractor/splice_opts'       % src_model_dir, '%s/exp/extractor/splice_opts'       % work_dir)

        misc.mkdirs ('%s/exp/ivectors_test_hires/conf' % work_dir)

        misc.copy_file ('%s/ivectors_test_hires/conf/splice.conf'       % src_model_dir, '%s/exp/ivectors_test_hires/conf'    % work_dir)

    misc.mkdirs ('%s/conf'  % work_dir)
    misc.copy_file ('%s/conf/mfcc.conf' % src_model_dir,        '%s/conf/mfcc.conf' % work_dir)
    misc.copy_file ('%s/conf/mfcc_hires.conf' % src_model_dir,  '%s/conf/mfcc_hires.conf' % work_dir)
    misc.copy_file ('%s/conf/online_cmvn.conf' % src_model_dir, '%s/conf/online_cmvn.conf' % work_dir)

    #
    # copy scripts and config files
    #
     
    misc.copy_file       ('%s/kaldi-run-adaptation.sh' % tmpl_dir, '%s/run-adaptation.sh' % work_dir)
    misc.copy_file       ('%s/kaldi-cmd.sh' % tmpl_dir,            '%s/cmd.sh' % work_dir)
    misc.render_template ('%s/kaldi-path.sh.template' % tmpl_dir,  '%s/path.sh' % work_dir, kaldi_root=kaldi_root)
    misc.copy_file       ('%s/kaldi-model-dist.sh' % tmpl_dir,     '%s/model-dist.sh' % work_dir)

    misc.symlink ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
    misc.symlink ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

    cmd = '/bin/bash -c "pushd %s && bash run-adaptation.sh && popd"' % work_dir
    logging.info (cmd)
    os.system (cmd)

    cmd = '/bin/bash -c "pushd %s && bash model-dist.sh "%s" && popd"' % (work_dir, dst_model_name)
    logging.info (cmd)
    os.system (cmd)
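
A hedged usage sketch for adapting a released model to a custom language model (every path below is a placeholder):

kaldi_adapt_lm(kaldi_root     = '/opt/kaldi',
               src_model_dir  = 'models/kaldi-generic-en-tdnn_sp',
               lm_fn          = 'work/lm/lm.arpa',
               work_dir       = 'work/adapt',
               dst_model_name = 'kaldi-generic-en-tdnn_sp-adapted')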
Example No. 27

#

config = misc.load_config('.speechrc')

corpora = config.get("speech", "speech_corpora")
wav16_dir = config.get("speech", "wav16")

out_dir = '%s/%s' % (corpora, corpus_out)
tmpfn_base = '/tmp/tmp16_%08x' % os.getpid()

if os.path.exists(out_dir):
    logging.error("%s already exists!" % out_dir)
    sys.exit(1)

logging.info("creating %s ..." % out_dir)
misc.mkdirs(out_dir)

#
# count good transcripts
#

total_good = 0
for ts in transcripts:

    if transcripts[ts]['quality'] < MIN_QUALITY:
        continue
    total_good += 1

#
# main
#
Example No. 28

#
# config
#

config = misc.load_config('.speechrc')

kaldi_root = config.get("speech", "kaldi_root")
wav16_dir = config.get("speech", "wav16")

#
# create basic work dir structure
#

# FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir)
misc.mkdirs('%s/local/dict' % data_dir)
misc.mkdirs(wav16_dir)
misc.mkdirs(mfcc_dir)
misc.symlink('../../../../../%s' % language_model_dir, '%s/lm' % work_dir)
misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

#
# generate speech and text corpora
#

logging.info("loading lexicon...")
lex = Lexicon(file_name=dictionary)
logging.info("loading lexicon...done.")

if sequitur_model_path:
Example No. 29

(options, args) = parser.parse_args()

if options.verbose:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

if len(args) < 2:
    parser.print_usage()
    sys.exit(1)

language_model = args[0]
text_corpora = args[1:]

outdir = '%s/%s' % (LANGUAGE_MODELS_DIR, language_model)
mkdirs(outdir)

#
# extract sentences into one big text file
#

train_fn = '%s/train_all.txt' % outdir

num_sentences = 0

with codecs.open(str(train_fn), 'w', 'utf8') as dstf:
    for text_corpus_name in text_corpora:
        src = '%s/%s.txt' % (TEXT_CORPORA_DIR, text_corpus_name)
        logging.info('reading from sources %s' % src)
        with codecs.open(str(src), 'r', 'utf8') as srcf:
            while True:
Example No. 30

srcdirfn = args[1]

#
# config
#

kaldi_root  = config.get("speech", "kaldi_root")

#
# clean up leftovers from previous runs
#

cmd = 'rm -rf %s' % WORKDIR
logging.info(cmd)
os.system(cmd)
misc.mkdirs(WORKDIR)

#
# copy scripts
#

misc.copy_file ('data/src/speech/kaldi-run-segmentation.sh', '%s/run-segmentation.sh' % WORKDIR)

misc.copy_file ('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % WORKDIR)
misc.render_template ('data/src/speech/kaldi-path.sh.template', '%s/path.sh' % WORKDIR, kaldi_root=kaldi_root)
misc.symlink ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % WORKDIR)
misc.symlink ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % WORKDIR)

#
# create skeleton dst model
#
Example No. 31

    def train(self, num_steps, incremental):

        # load discourses from db, resolve non-unique inputs (implicit or of responses)

        logging.info('load discourses from db...')

        drs = {}
        drs_prio = {}

        for dr in self.session.query(model.TrainingData).filter(
                model.TrainingData.lang == self.lang):

            if not dr.inp in drs:
                drs[dr.inp] = set()

            if not dr.inp in drs_prio:
                drs_prio[dr.inp] = dr.prio

            if dr.prio > drs_prio[dr.inp]:
                # discard lower-prio responses
                logging.info('DRS discarding: %s -> %s' %
                             (dr.inp, repr(drs[dr.inp])))

                drs[dr.inp] = set()
                drs_prio[dr.inp] = dr.prio
            else:
                if dr.prio < drs_prio[dr.inp]:
                    logging.info('DRS skipping: %s -> %s' %
                                 (dr.inp, repr(dr.resp)))
                    continue

            drs[dr.inp].add(dr.resp)
            if DEBUG_LIMIT > 0 and len(drs) >= DEBUG_LIMIT:
                logging.warn(
                    '  stopped loading discourses because DEBUG_LIMIT of %d was reached.'
                    % DEBUG_LIMIT)
                break

        # parse json, implicit or responses:

        self.training_data = []

        for inp in drs:

            td_inp = list(map(lambda a: unicode(a), json.loads(inp)))

            td_resp = []
            num_resp = 0
            for r in drs[inp]:
                td_r = list(map(lambda a: unicode(a), json.loads(r)))
                if len(td_resp) > 0:
                    td_resp.append(OR_SYMBOL)
                td_resp.extend(td_r)
                if len(td_r) > 0:
                    num_resp += 1
                if num_resp > MAX_NUM_RESP:
                    break

            self.training_data.append((td_inp, td_resp))

        #
        # set up model dir
        #

        if not incremental:
            try:
                shutil.rmtree(self.model_dir)
            except:
                pass

            mkdirs(self.model_dir)

        #
        # 2D diagram of available data
        #

        dia = self.compute_2d_diagram()

        print(
            "     n  i  o 01020304050607080910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455"
        )

        mol = 0

        for inp_len in range(len(dia)):
            s = 0
            l = ''
            output_len = 0
            cnt = 0
            for n in dia[inp_len]:
                if cnt < 56:
                    l += ' ' + self._ascii_art(n)
                s += n
                cnt += 1
                if n > 0:
                    output_len = cnt

            if output_len > mol:
                mol = output_len

            print('%6d %2d %2d %s' % (s, inp_len + 1, mol, l))

        #
        # load or create input/output dicts
        #

        if incremental:
            logging.info("loading input and output dicts...")
            self.load_dicts()

        else:
            logging.info("computing input and output dicts...")

            self.compute_dicts()
            self.save_dicts()

        #
        # compute datasets
        #

        logging.info("computing datasets...")

        self.ds_train = []
        self.ds_dev = []

        cnt = 0
        for inp, resp in self.training_data:

            x = self.compute_x(inp)
            # print dr.inp, x

            if len(x) <= 0:
                logging.error("len(x)<=0: %s -> %s" % (repr(inp), repr(resp)))
                continue

            y = self.compute_y(resp)
            # print dr.resp, y

            if cnt % 50 == 9:
                data_set = self.ds_dev
            else:
                data_set = self.ds_train

            data_set.append([x, y])
            cnt += 1

        logging.info(
            "computing datasets done. len(ds_train)=%d, len(ds_dev)=%d" %
            (len(self.ds_train), len(self.ds_dev)))

        #
        # seq2seq model setup and training starts here
        #

        # # setup config to use BFC allocator
        config = tf.ConfigProto()
        # config.gpu_options.allocator_type = 'BFC'

        with tf.Session(config=config) as tf_session:
            with open('%s/train.log' % self.model_dir, 'w') as logf:

                tf_model = self.create_tf_model(tf_session, 'train')

                # load latest state in incremental mode

                if incremental:
                    tf_model.restore(tf_session, self.model_fn)

                # this is the training loop

                step_time, loss, best_perplexity = 0.0, 0.0, 100000.0
                current_step = 0
                best_step = 0
                # previous_losses = []
                while current_step <= num_steps:

                    # get a random training batch and perform a training step on it

                    start_time = time()
                    source, source_len, target, target_len = self._prepare_batch(
                        self.ds_train)

                    step_loss, summary = tf_model.train(
                        tf_session,
                        encoder_inputs=source,
                        encoder_inputs_length=source_len,
                        decoder_inputs=target,
                        decoder_inputs_length=target_len)

                    step_time += (time() - start_time) / STEPS_PER_STAT
                    loss += step_loss / STEPS_PER_STAT
                    current_step = tf_model.global_step.eval()

                    if current_step % STEPS_PER_STAT == 0:

                        # print statistics for the previous epoch.
                        perplexity = math.exp(loss) if loss < 300 else float(
                            'inf')

                        steps_done = tf_model.global_step.eval()
                        eta = (num_steps - steps_done) * step_time

                        # # decrease learning rate if no improvement was seen over last 3 times.
                        # if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                        #     tf_session.run(tf_model.learning_rate_decay_op)
                        # previous_losses.append(loss)

                        sum_dev_loss = 0.0

                        # for i in range (NUM_EVAL_STEPS):

                        #     # get a random dev batch and perform an eval step on it

                        #     source, source_len, target, target_len = self._prepare_batch(self.ds_dev)
                        #     dev_loss, summary = tf_model.eval (tf_session,
                        #                                         encoder_inputs=source, encoder_inputs_length=source_len,
                        #                                         decoder_inputs=target, decoder_inputs_length=target_len)
                        #     sum_dev_loss += dev_loss
                        # sum_dev_loss /= NUM_EVAL_STEPS

                        num_eval_steps = len(self.ds_dev) / self.batch_size

                        for i in range(num_eval_steps):

                            # get a random dev batch and perform an eval step on it

                            source, source_len, target, target_len = self._prepare_batch(
                                self.ds_dev, i * self.batch_size)
                            dev_loss, summary = tf_model.eval(
                                tf_session,
                                encoder_inputs=source,
                                encoder_inputs_length=source_len,
                                decoder_inputs=target,
                                decoder_inputs_length=target_len)
                            sum_dev_loss += dev_loss

                        sum_dev_loss /= num_eval_steps

                        dev_perplexity = math.exp(
                            sum_dev_loss) if sum_dev_loss < 300 else float(
                                'inf')

                        log_str = "global step %6d/%6d step-time %.6fs ETA %.2fs train_perpl %.6f dev_perpl %.6f" % \
                                  (steps_done, num_steps, step_time, eta, perplexity, dev_perplexity)

                        logging.info(log_str)
                        logf.write(log_str + '\n')

                        if dev_perplexity < best_perplexity:
                            best_perplexity = dev_perplexity
                            best_step = tf_model.global_step.eval()

                            log_str = "   *** best eval result so far"
                            logging.info(log_str)
                            logf.write(log_str + '\n')

                            # tf_model.save(tf_session, self.model_fn, global_step=tf_model.global_step)
                            tf_model.save(tf_session,
                                          self.model_fn,
                                          global_step=None)

                        step_time, loss = 0.0, 0.0

                        sys.stdout.flush()
                    logf.flush()

                logging.info("training finished.")
Example No. 32

                cmd = './cluster_individual.sh wav/%s' % fn
                print "%6d/%6d %s" % (cnt, total, cmd)
                scriptf.write('echo %s\n' %fn)
                scriptf.write('%s &\n' % cmd)

        scriptf.write('wait\n')

    os.system('bash run_parallel.sh')

########################################################
# Get a sample from each file for each cluster         #
########################################################

if stage <= 2:

    misc.mkdirs('sample')

    # # save all clusters appearing in each file
    # for f in data/*; do
    #   fname=`echo "$f" | rev | cut -f1 -d'/' | rev`
    #   echo python get_clust.py ${f}/${fname}.c.3.seg
    #   python get_clust.py ${f}/${fname}.c.3.seg
    # done

    with open ('run_parallel.sh', 'w') as scriptf:

        cnt = 0
        for fn in os.listdir('data'):
            cnt += 1

            if (cnt % nj) == 0: