def __init__(self, corpus_name, create_db=False):
    """Load the transcripts database of an audio corpus.

    Reads every transcripts*.csv file found in the corpus' transcript
    directory into self.ts, keyed by utterance id (cfn).

    corpus_name -- name of the audio corpus; used to derive the
                   transcript directory via the TSDIR template
    create_db   -- if True, create the transcript directory if missing
    """
    self.corpus_name = corpus_name
    self.ts = {}                      # cfn -> transcript entry dict
    self.tsdir = TSDIR % corpus_name
    if create_db:
        if not os.path.exists(self.tsdir):
            logging.info('creating %s' % self.tsdir)
            misc.mkdirs(self.tsdir)
    for tsfn in os.listdir(self.tsdir):
        # TODO(review): debug print left in — consider removing
        print(tsfn)
        # only transcripts*.csv files belong to the database
        if not tsfn.startswith('transcripts') or not tsfn.endswith('.csv'):
            continue
        with codecs.open('%s/%s' % (self.tsdir, tsfn), 'r', 'utf8') as f:
            # NOTE(review): reading stops at the first empty line, so a
            # blank line in the middle of a csv would truncate loading.
            while True:
                line = f.readline().rstrip()
                if not line:
                    break
                # expected format: cfn;dirfn;audiofn;prompt;ts;quality
                parts = line.split(';')
                # print repr(parts)
                if len(parts) != 6:
                    raise Exception("***ERROR in transcripts: %s" % line)
                cfn = parts[0]
                dirfn = parts[1]
                audiofn = parts[2]
                prompt = parts[3]
                ts = parts[4]
                quality = int(parts[5])
                # speaker id is the prefix of the cfn (before the first '-')
                spk = cfn.split('-')[0]
                v = {
                    'cfn': cfn,
                    'dirfn': dirfn,
                    'audiofn': audiofn,
                    'prompt': prompt,
                    'ts': ts,
                    'quality': quality,
                    'spk': spk,
                    'corpus_name': self.corpus_name
                }
                self.ts[cfn] = v
def export_gpt2(self, offset=0, debug_limit=0):
    """Export all unique training inputs as single-turn GPT-2 dialog JSON files.

    Runs every input through the NLP pipeline and writes one
    zamiaai-gpt2/NNNNNNN.json file per input/response pair.

    offset      -- skip this many inputs first (resume support)
    debug_limit -- unused; kept for interface compatibility
    """
    for mn2 in self.all_skills:
        self.consult_skill(mn2)
    self.setup_nlp_model()

    logging.info('load discourses from db...')
    inps = set()
    for dr in self.session.query(model.TrainingData).filter(
            model.TrainingData.lang == self.lang):
        if dr.inp not in inps:
            inps.add(dr.inp)

    user_uri = USER_PREFIX + 'gpt2'
    ctx = self.create_context(user=user_uri, realm='__gpt2__')

    cnt = 0
    misc.mkdirs('zamiaai-gpt2')
    for inp in sorted(inps):
        if cnt < offset:
            cnt += 1
            continue
        try:
            logging.info(u'%07d/%07d QUES : %s' % (cnt, len(inps), inp))
            out, score, action = self.process_input(ctx, inp, run_trace=False)
            logging.info(u'%07d/%07d RESP: [%6.1f] %s ' % (cnt, len(inps), score, out))
            data = {"info": "", "dlg": [{'q': inp, 'a': out}]}
            datafn = 'zamiaai-gpt2/%07d.json' % cnt
            with codecs.open(datafn, 'w', 'utf8') as dataf:
                dataf.write(json.dumps(data))
            # logging.info('%s written.' % datafn)
            # NOTE: cnt is only advanced on success, so failed inputs do not
            # leave gaps in the output numbering.
            cnt += 1
        except Exception:
            # FIX: was a bare `except:` that logged only the string
            # 'EXCEPTION' — it swallowed KeyboardInterrupt/SystemExit and
            # lost the traceback. Log the full exception instead.
            logging.exception('EXCEPTION while processing input %r' % inp)
def create_training_data_for_language_model(transcript_objs, utt_dict, data_dir):
    """Write the language-model training corpus and word list.

    Merges the transcript dicts of all transcript_objs and writes
    <data_dir>/local/lm/train_nounk.txt (one transcript per line, sorted
    by utterance id) and <data_dir>/local/lm/wordlist.txt (sorted tokens
    from utt_dict), both UTF-8 encoded.
    """
    merged = {}
    for tobj in transcript_objs:
        merged.update(tobj.ts)

    misc.mkdirs('%s/local/lm' % data_dir)

    fn = '%s/local/lm/train_nounk.txt' % data_dir
    with open(fn, 'w') as f:
        for utt_id, entry in sorted(merged.items()):
            f.write((u'%s\n' % entry['ts']).encode('utf8'))
    logging.info("%s written." % fn)

    fn = '%s/local/lm/wordlist.txt' % data_dir
    with open(fn, 'w') as f:
        f.write(''.join((u'%s\n' % token).encode('utf8')
                        for token in sorted(utt_dict)))
    logging.info("%s written." % fn)
def export_kaldi_data(wav16_dir, audio_corpora, destdirfn, tsdict):
    """Write the kaldi data-dir files (wav.scp, utt2spk, text) for tsdict.

    destdirfn is used as a path prefix; text is UTF-8 encoded, wav paths
    point into <wav16_dir>/<corpus_name>/<utt_id>.wav.
    """
    logging.info("Exporting kaldi data to %s..." % destdirfn)

    misc.mkdirs(destdirfn)

    with open(destdirfn+'wav.scp','w') as wavscpf, \
         open(destdirfn+'utt2spk','w') as utt2spkf, \
         open(destdirfn+'text','w') as textf:

        for utt_id, entry in sorted(tsdict.items()):
            textf.write((u'%s %s\n' % (utt_id, entry['ts'])).encode('utf8'))
            wavscpf.write('%s %s/%s/%s.wav\n' % (utt_id, wav16_dir,
                                                 entry['corpus_name'], utt_id))
            utt2spkf.write('%s %s\n' % (utt_id, entry['spk']))
def do_save_audio ():
    """Save the current recording buffer as the next free de5-NNN.wav
    and append its prompt to etc/prompts-original.

    Reads globals: prompt, vf_login, rec_dir, recording, stdscr.
    Shows a curses popup and waits for a keypress when done.
    """
    global prompt, vf_login, rec_dir, recording, stdscr

    ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')

    audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds)
    logging.debug('audiodirfn: %s' % audiodirfn)
    misc.mkdirs(audiodirfn)

    # find the first free de5-NNN.wav slot
    cnt = 0
    while True:
        cnt += 1
        audiofn = '%s/de5-%03d.wav' % (audiodirfn, cnt)
        if not os.path.isfile(audiofn):
            break
    logging.debug('audiofn: %s' % audiofn)

    # create wav file (mono, 16 bit, SAMPLE_RATE)
    wf = wave.open(audiofn, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLE_RATE)
    packed_audio = struct.pack('%sh' % len(recording), *recording)
    wf.writeframes(packed_audio)
    wf.close()

    # append etc/prompts-original file
    etcdirfn = '%s/%s-%s-rec/etc' % (rec_dir, vf_login, ds)
    logging.debug('etcdirfn: %s' % etcdirfn)
    misc.mkdirs(etcdirfn)

    promptsfn = '%s/prompts-original' % etcdirfn
    # FIX: pass an explicit 'utf8' encoding — codecs.open() without one
    # returns a plain byte stream, so a non-ASCII unicode prompt raised
    # UnicodeEncodeError; the rest of the codebase opens prompts-original
    # with 'utf8' as well.
    with codecs.open(promptsfn, 'a', 'utf8') as promptsf:
        promptsf.write('de5-%03d %s\n' % (cnt, prompt))

    misc.message_popup(stdscr, 'WAVE file written', audiofn)
    stdscr.getch()
def copy_scripts_and_config_files(work_dir, kaldi_root):
    """Populate work_dir with the kaldi run scripts and config files."""

    # top-level run scripts
    misc.copy_file('data/src/speech/kaldi-run-lm.sh', '%s/run-lm.sh' % work_dir)
    misc.copy_file('data/src/speech/kaldi-run-chain.sh',
                   '%s/run-chain.sh' % work_dir)
    misc.copy_file('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % work_dir)
    misc.render_template('data/src/speech/kaldi-path.sh.template',
                         '%s/path.sh' % work_dir, kaldi_root=kaldi_root)

    # feature extraction configuration
    misc.mkdirs('%s/conf' % work_dir)
    for src_bn, dst_rel in (('kaldi-mfcc.conf', 'conf/mfcc.conf'),
                            ('kaldi-mfcc-hires.conf', 'conf/mfcc_hires.conf'),
                            ('kaldi-online-cmvn.conf', 'conf/online_cmvn.conf')):
        misc.copy_file('data/src/speech/%s' % src_bn,
                       '%s/%s' % (work_dir, dst_rel))

    # local helper scripts
    misc.mkdirs('%s/local' % work_dir)
    misc.copy_file('data/src/speech/kaldi-score.sh',
                   '%s/local/score.sh' % work_dir)
    misc.mkdirs('%s/local/nnet3' % work_dir)
    misc.copy_file('data/src/speech/kaldi-run-ivector-common.sh',
                   '%s/local/nnet3/run_ivector_common.sh' % work_dir)
def export_kaldi_data (destdirfn, tsdict):
    """Write kaldi data-dir files (wav.scp, utt2spk, text, spk2gender).

    Uses the global wav16_dir for wav paths and options.lang to locate
    the spk2gender source file. destdirfn is used as a path prefix.
    """
    global wav16_dir

    logging.info ( "Exporting to %s..." % destdirfn)

    misc.mkdirs(destdirfn)

    with open(destdirfn+'wav.scp','w') as wavscpf, \
         open(destdirfn+'utt2spk','w') as utt2spkf, \
         open(destdirfn+'text','w') as textf:

        for utt_id, entry in sorted(tsdict.items()):
            textf.write((u'%s %s\n' % (utt_id, entry['ts'])).encode('utf8'))
            wavscpf.write('%s %s/%s.wav\n' % (utt_id, wav16_dir, utt_id))
            utt2spkf.write('%s %s\n' % (utt_id, entry['spk']))

    misc.copy_file ('data/src/speech/%s/spk2gender' % options.lang,
                    '%s/spk2gender' % destdirfn)
def create_basic_work_dir_structure(data_dir, wav16_dir, mfcc_dir, work_dir,
                                    language_model_dir, kaldi_root):
    """Create the skeleton directory layout and kaldi symlinks for a run."""
    # FIXME: unused, remove
    for dirfn in ('%s/lexicon' % data_dir,
                  '%s/local/dict' % data_dir,
                  wav16_dir,
                  mfcc_dir):
        misc.mkdirs(dirfn)

    misc.symlink(language_model_dir, '%s/lm' % work_dir)
    for sub in ('steps', 'utils'):
        misc.symlink('%s/egs/wsj/s5/%s' % (kaldi_root, sub),
                     '%s/%s' % (work_dir, sub))
continue speaker = options.speaker1 if resp == '1' else options.speaker2 # does a directory for recordings of this speaker already exist? speakerdirfn = None for fn in os.listdir(options.outdir): if fn.startswith(speaker): speakerdirfn = '%s/%s' % (options.outdir, fn) break if not speakerdirfn: ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d') speakerdirfn = '%s/%s-%s-rec' % (options.outdir, speaker, ds) misc.mkdirs('%s/wav' % speakerdirfn) misc.mkdirs('%s/etc' % speakerdirfn) destfn = '%s/wav/%s' % (speakerdirfn, os.path.basename(segmentfn)) os.rename(segmentfn, destfn) print "moved %s to %s" % (segmentfn, destfn) promptsfn = '%s/etc/prompts-original' % speakerdirfn with codecs.open(promptsfn, 'a', 'utf8') as promptsf: wavbn = os.path.basename(segmentfn) wavbn = os.path.splitext(wavbn)[0] promptsf.write(u'%s %s\n' % (wavbn, prompt)) print "%s written." % promptsfn next_segment() play_wav()
else: logging.basicConfig(level=logging.INFO) lang = options.lang use_prompts = options.use_prompts if len(args) != 1: logging.error("Exactly one corpus (text or speech) must be provided.") parser.print_help() sys.exit(1) corpus = args[0] misc.mkdirs(TEXT_CORPORA_DIR) out_file = '%s/%s.txt' % (TEXT_CORPORA_DIR, corpus) with codecs.open(out_file, "w", "utf-8") as outf: # I haven't figured out how to refactor the processing algorithms of the # parole corpus to implement a generator. if corpus == "parole_de": corpus_path = config.get("speech", corpus) proc_parole_de(corpus_path, load_punkt_tokenizer, outf) elif corpus in TEXT_CORPORA: corpus_path = config.get("speech", corpus) for sentence in TEXT_CORPORA[corpus](corpus_path): outf.write(sentence + "\n") elif corpus in SPEECH_CORPORA: for sentence in SPEECH_CORPORA[corpus]():
# # audio, prompts # for subset in os.listdir(SRCDIR): if not subset in SUBSETS: continue for speaker in os.listdir(SRCDIR + '/' + subset): for book_id in os.listdir(SRCDIR + '/' + subset + '/' + speaker): folder = 'librispeech%s-%s' % (speaker, book_id) dstdir = '%s/%s' % (DESTDIR, folder) misc.mkdirs('%s/flac' % dstdir) misc.mkdirs('%s/etc' % dstdir) promptsfn = '%s/etc/prompts-original' % dstdir transfn = '%s/%s/%s/%s/%s-%s.trans.txt' % ( SRCDIR, subset, speaker, book_id, speaker, book_id) with codecs.open(promptsfn, 'w', 'utf8') as promptsf: with codecs.open(transfn, 'r', 'utf8') as transf: for line in transf: parts = line.split() promptsf.write(line) flac_src = '%s/%s/%s/%s/%s.flac' % ( SRCDIR, subset, speaker, book_id, parts[0]) flac_dst = '%s/flac/%s.flac' % (dstdir, parts[0])
dict_name = options.dict_name workdir = 'data/dst/dict-models/%s/sequitur' % dict_name # # load lexicon # logging.info("loading lexicon...") lex = Lexicon(file_name=dict_name) logging.info("loading lexicon...done.") # # export # misc.mkdirs(workdir) with codecs.open('%s/train.lex' % workdir, 'w', 'utf8') as trainf, \ codecs.open('%s/test.lex' % workdir, 'w', 'utf8') as testf, \ codecs.open('%s/all.lex' % workdir, 'w', 'utf8') as allf : cnt = 0 for word in lex: ipa = lex[word]['ipa'] xs = ipa2xsampa(word, ipa, spaces=True, stress_to_vowels=False) if cnt % 10 == 0: testf.write(u'%s %s\n' % (word, xs))
def export_sphinx_case(work_dir, sphinxtrain_cfg_fn):
    """Export transcripts, lexicon and config into a sphinxtrain case dir.

    Writes prompts.sent, wlist.txt, the etc/ config files, the run-feat.sh
    feature-extraction script, train/test fileids+transcriptions, the
    pronunciation dictionary and phone set, and renders sphinx-run.sh.

    Uses module-level state: ts_all/ts_train/ts_test, transcripts, lex,
    cfn2corpus, wav16_dir, lm_name, options, NJOBS, ENABLE_NOISE_FILLER,
    NOISE_WORD.

    work_dir           -- destination sphinxtrain case directory
    sphinxtrain_cfg_fn -- sphinx_train.cfg file to copy into etc/
    """

    #
    # language model
    #

    misc.mkdirs('%s' % work_dir)

    # all transcripts, one sentence per line
    fn = '%s/prompts.sent' % work_dir
    with codecs.open(fn, 'w', 'utf8') as outf:
        for cfn in ts_all:
            transcript = transcripts[cfn]['ts']
            outf.write('%s\n' % transcript)
    logging.info("%s written." % fn)

    # word list from the lexicon (noise word excluded when the filler is on)
    fn = '%s/wlist.txt' % work_dir
    with codecs.open(fn, 'w', 'utf8') as outf:
        for word in lex:
            if ENABLE_NOISE_FILLER:
                if word == NOISE_WORD:
                    logging.debug('skipping noise word')
                    continue
            outf.write('%s\n' % word)
    logging.info("%s written." % fn)

    #
    # create work_dir structure
    #

    mfcc_dir = "%s/mfcc" % work_dir

    misc.mkdirs('%s/logs' % work_dir)
    misc.mkdirs('%s/etc' % work_dir)
    misc.mkdirs('%s' % mfcc_dir)

    # sphinx_train.cfg plus filler and feature parameter files
    misc.copy_file(sphinxtrain_cfg_fn, '%s/etc/sphinx_train.cfg' % work_dir)
    if ENABLE_NOISE_FILLER:
        misc.copy_file('data/src/speech/sphinx-voxforge-noise.filler',
                       '%s/etc/voxforge.filler' % work_dir)
    else:
        misc.copy_file('data/src/speech/sphinx-voxforge.filler',
                       '%s/etc/voxforge.filler' % work_dir)
    misc.copy_file('data/src/speech/sphinx-feat.params',
                   '%s/etc/feat.params' % work_dir)

    #
    # prompts
    #

    train_fifn = '%s/etc/voxforge_train.fileids' % work_dir
    train_tsfn = '%s/etc/voxforge_train.transcription' % work_dir
    test_fifn = '%s/etc/voxforge_test.fileids' % work_dir
    test_tsfn = '%s/etc/voxforge_test.transcription' % work_dir
    runfeatfn = '%s/run-feat.sh' % work_dir

    lex_covered = set()

    SPHINXFE = "sphinx_fe -i '%s' -part 1 -npart 1 -ei wav -o '%s' -eo mfc -nist no -raw no -mswav yes -samprate 16000 -lowerf 130 -upperf 6800 -nfilt 25 -transform dct -lifter 22 >>logs/mfcc%02d.log 2>&1 &\n"

    # shell script that extracts MFCC features, NJOBS jobs in parallel
    with codecs.open(runfeatfn, 'w', 'utf8') as runfeatf:
        runfeatf.write('#!/bin/bash\n\n')
        cnt = 0
        for cfn in ts_all:
            w16filename = "%s/%s/%s.wav" % (wav16_dir, cfn2corpus[cfn], cfn)
            mfcfilename = "mfcc/%s.mfc" % cfn
            runfeatf.write(SPHINXFE % (w16filename, mfcfilename, cnt))
            cnt = (cnt + 1) % NJOBS
            if cnt == 0:
                # barrier after each batch of NJOBS background jobs
                runfeatf.write('wait\n')
    logging.info("%s written." % runfeatfn)

    with codecs.open (train_fifn, 'w', 'utf8') as train_fif, \
         codecs.open (train_tsfn, 'w', 'utf8') as train_tsf, \
         codecs.open (test_fifn, 'w', 'utf8') as test_fif, \
         codecs.open (test_tsfn, 'w', 'utf8') as test_tsf:

        for cfn in ts_train:
            train_fif.write('%s\n' % cfn)
            tokens = tokenize(ts_train[cfn]['ts'], lang=options.lang,
                              keep_punctuation=False)
            ts = u' '.join(tokens)
            train_tsf.write(u'<s> %s </s> (%s)\n' % (ts, cfn))
            for token in tokens:
                if not token in lex:
                    # FIX: the original logged the bare format string
                    # without interpolating the offending token
                    logging.error('word %s not covered by dict!' % token)
                    sys.exit(1)
                lex_covered.add(token)

        for cfn in ts_test:
            test_fif.write('%s\n' % cfn)
            tokens = tokenize(ts_test[cfn]['ts'], lang=options.lang,
                              keep_punctuation=False)
            ts = u' '.join(tokens)
            test_tsf.write(u'<s> %s </s> (%s)\n' % (ts, cfn))
            for token in tokens:
                if not token in lex:
                    # FIX: interpolate the token (see above)
                    logging.error('word %s not covered by dict!' % token)
                    sys.exit(1)
                lex_covered.add(token)

    logging.info("%s written." % train_tsfn)
    logging.info("%s written." % train_fifn)
    logging.info("%s written." % test_tsfn)
    logging.info("%s written." % test_fifn)

    # generate dict: only words actually covered by the transcripts
    phoneset = set()
    pdfn = '%s/etc/voxforge.dic' % work_dir
    with codecs.open(pdfn, 'w', 'utf8') as pdf:
        for word in lex:
            if ENABLE_NOISE_FILLER:
                if word == NOISE_WORD:
                    logging.debug('skipping noise word')
                    continue
            if not word in lex_covered:
                logging.debug(
                    'skipping word %s as it is not covered by transcripts' %
                    word)
                continue
            ipa = lex[word]['ipa']
            xs = ipa2xsampa(word, ipa)
            xa = xsampa2xarpabet(word, xs)
            pdf.write(u'%s %s\n' % (word, xa))
            phones = xa.split(' ')
            for phone in phones:
                if len(phone.strip()) == 0:
                    logging.error(
                        u"***ERROR: empty phone detected in lex entry %s %s" %
                        (word, ipa))
                phoneset.add(phone)
    logging.info("%s written." % pdfn)
    logging.info("Got %d phones." % len(phoneset))

    phfn = '%s/etc/voxforge.phone' % work_dir
    with codecs.open(phfn, 'w', 'utf8') as phf:
        for phone in phoneset:
            phf.write(u'%s\n' % phone)
        phf.write(u'SIL\n')
        if ENABLE_NOISE_FILLER:
            phf.write(u'NSPC\n')
    logging.info("%s written." % phfn)

    misc.render_template('data/src/speech/sphinx-run.sh.template',
                         '%s/sphinx-run.sh' % work_dir, lm_name=lm_name)
def on_message(client, userdata, message):
    """MQTT message dispatcher.

    Handles four topics under the msg_cond lock:
    TOPIC_INPUT_AUDIO  -- buffer/record incoming PCM audio, enqueue for ASR
    TOPIC_INPUT_TEXT   -- enqueue text input for processing
    TOPIC_RESPONSE     -- speak the response via TTS
    TOPIC_CONFIG       -- update listen/record/asr/attention flags

    All state is kept in module-level globals; any exception is caught and
    logged so the MQTT client loop keeps running.
    """
    # global kernal, lang
    global msg_queue, msg_cond, ignore_audio_before
    global wfs, vf_login, rec_dir, audiofns, pstr, hstr, astr, audio_cnt
    global do_listen, do_rec, do_asr, att_force, listening, attention
    global tts_lock, tts

    # logging.debug( "message received %s" % str(message.payload.decode("utf-8")))
    # logging.debug( "message topic=%s" % message.topic)
    # logging.debug( "message qos=%s" % message.qos)
    # logging.debug( "message retain flag=%s" % message.retain)

    msg_cond.acquire()
    try:
        if message.topic == TOPIC_INPUT_AUDIO:
            data = json.loads(message.payload)
            data['topic'] = message.topic

            audio = data['pcm']
            loc = data['loc']
            do_finalize = data['final']
            ts = dateutil.parser.parse(data['ts'])

            # ignore old audio recordings that may have lingered in the
            # message queue
            age = (datetime.datetime.now() - ts).total_seconds()
            if age > MAX_AUDIO_AGE:
                # logging.debug (" ignoring audio that is too old: %fs > %fs" % (age, MAX_AUDIO_AGE))
                return
            # drop audio from the window where we were talking ourselves
            if ts < ignore_audio_before:
                # logging.debug (" ignoring audio that is ourselves talking: %s < %s" % (unicode(ts), unicode(ignore_audio_before)))
                return

            audio_cnt += 1
            # progress indicator grows one dot per 10 frames
            # (py2 integer division — presumably intentional; verify)
            pstr = '.' * (audio_cnt / 10 + 1)

            if do_rec:
                # store recording in WAV format, one open wave file per loc
                if not loc in wfs:
                    wfs[loc] = None
                if not wfs[loc]:
                    # no file open for this loc yet: pick the next free
                    # de5-NNN.wav slot in today's recording directory
                    ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
                    audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds)
                    logging.debug('audiodirfn: %s' % audiodirfn)
                    misc.mkdirs(audiodirfn)
                    cnt = 0
                    while True:
                        cnt += 1
                        audiofns[loc] = '%s/de5-%03d.wav' % (audiodirfn, cnt)
                        if not os.path.isfile(audiofns[loc]):
                            break
                    logging.debug('audiofn: %s' % audiofns[loc])
                    # create wav file (mono, 16 bit, SAMPLE_RATE)
                    wfs[loc] = wave.open(audiofns[loc], 'wb')
                    wfs[loc].setnchannels(1)
                    wfs[loc].setsampwidth(2)
                    wfs[loc].setframerate(SAMPLE_RATE)
                packed_audio = struct.pack('%sh' % len(audio), *audio)
                wfs[loc].writeframes(packed_audio)
                if do_finalize:
                    # utterance complete: close the wav file, show its name
                    afn_parts = audiofns[loc].split('/')
                    pstr = afn_parts[len(afn_parts) - 1]
                    logging.info('audiofn %s written.' % audiofns[loc])
                    wfs[loc].close()
                    wfs[loc] = None
            else:
                audiofns[loc] = ''
                if do_finalize:
                    pstr = '***'

            if do_finalize:
                audio_cnt = 0
                if do_asr:
                    # hand the finished utterance to the ASR worker
                    msg_queue.append(data)
                    msg_cond.notify_all()
                else:
                    if do_rec:
                        attention = 30
            publish_state(client)

        elif message.topic == TOPIC_INPUT_TEXT:
            data = json.loads(message.payload)
            data['topic'] = message.topic
            msg_queue.append(data)
            msg_cond.notify_all()
            # print data

        elif message.topic == TOPIC_RESPONSE:
            msg = json.loads(message.payload)
            if msg['utt']:
                tts_lock.acquire()
                try:
                    logging.debug('tts.say...')
                    tts.say(msg['utt'])
                    logging.debug('tts.say finished.')
                except:
                    logging.error('TTS EXCEPTION CAUGHT %s' % traceback.format_exc())
                finally:
                    tts_lock.release()
                # suppress our own speech from being re-recorded as input
                ignore_audio_before = datetime.datetime.now(
                ) + datetime.timedelta(seconds=AUDIO_EXTRA_DELAY)
            listening = True
            publish_state(client)

        elif message.topic == TOPIC_CONFIG:
            logging.debug("message received %s" % str(message.payload.decode("utf-8")))
            logging.debug("message topic=%s" % message.topic)
            logging.debug("message qos=%s" % message.qos)
            logging.debug("message retain flag=%s" % message.retain)
            data = json.loads(message.payload)
            do_listen = data['listen']
            do_rec = data['record']
            do_asr = data['asr']
            att_force2 = data['att']
            # forced attention toggling: raise to 30 on activation,
            # drop to 2 when the force flag is cleared
            if att_force2:
                attention = 30
                att_force = True
            elif att_force:
                attention = 2
                att_force = False
            publish_state(client)

    except:
        logging.error('EXCEPTION CAUGHT %s' % traceback.format_exc())
    finally:
        msg_cond.release()
lex = Lexicon(file_name=dict_name) logging.info("loading lexicon...done.") # # cleanup leftovers from previous runs # cmd = 'rm -rf %s' % dst_dir logging.info(cmd) os.system(cmd) # # dictionary export # misc.mkdirs('%s/data/local/dict' % dst_dir) dictfn2 = '%s/data/local/dict/lexicon.txt' % dst_dir logging.info("Exporting dictionary...") ps = {} with open(dictfn2, 'w') as dictf: dictf.write('!SIL SIL\n') for token in sorted(lex): multi = lex.get_multi(token) for form in multi: ipa = multi[form]['ipa']
continue word = parts[0] ipa = parts[1] token = word.replace(u"·", u"").lower() wiktionary[token] = (word, ipa) print "loading wiktionary... done. %d entries." % len(wiktionary) # # export training data for sequitur # os.system("rm -rf %s" % WORKDIR) misc.mkdirs(WORKDIR) num_missing = 0 num_found = 0 with codecs.open('%s/train.lex' % WORKDIR, 'w', 'utf8') as trainf, \ codecs.open('%s/test.lex' % WORKDIR, 'w', 'utf8') as testf, \ codecs.open('%s/all.lex' % WORKDIR, 'w', 'utf8') as allf : cnt = 0 for token in lex: if not token in wiktionary: # print u"Missing in wiktionary: %s" % token num_missing += 1 else:
logging.info("conv_action: %s" % repr(action)) if ai_utt: tts.say(ai_utt) print # # save audio recording, if requested # if options.record_audio: ds = datetime.date.strftime(datetime.date.today(), '%Y%m%d') audiodirfn = '%s/%s-%s-rec/wav' % (rec_dir, vf_login, ds) misc.mkdirs(audiodirfn) cnt = 0 while True: cnt += 1 audiofn = '%s/de5-%03d.wav' % (audiodirfn, cnt) if not os.path.isfile(audiofn): break # create wav file wf = wave.open(audiofn, 'wb') wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(SAMPLE_RATE)
def train(self, num_steps, incremental):
    """Train the keras intent classification model on db training data.

    num_steps   -- unused (kept for interface compatibility)
    incremental -- if True, reuse the existing model dir and dicts;
                   otherwise wipe MODEL_DIR and recompute the dicts

    Saves weights to self.keras_weights_fn whenever the validation loss
    improves.
    """

    # load discourses from db
    logging.info('load discourses from db...')

    self.training_data = []
    tds = set()             # inputs seen so far (dedup)

    for td in self.session.query(model.TrainingData).filter(model.TrainingData.lang==LANG).filter(model.TrainingData.module!='bots'):
        if td.inp in tds:
            continue
        tds.add(td.inp)
        inp = tokenize(td.inp, lang=LANG)
        # truncate over-long inputs to the model's input window
        if len(inp) > INPUT_MAX_LEN:
            inp = inp[:INPUT_MAX_LEN]
        self.training_data.append((inp, td.module))
        if DEBUG_LIMIT and len(tds)>DEBUG_LIMIT:
            break

    shuffle (self.training_data)

    #
    # set up model dir
    #

    if not incremental:
        try:
            # best-effort cleanup of a previous run; missing dir is fine
            shutil.rmtree(MODEL_DIR)
        except:
            pass
        misc.mkdirs(MODEL_DIR)

    #
    # load or create input/output dicts
    #

    if incremental:
        logging.info("loading input and output dicts...")
        self.load_dicts()
    else:
        logging.info("computing input and output dicts...")
        self.compute_dicts()
        self.save_dicts()

    #
    # compute datasets
    #

    logging.info("computing datasets...")

    train_x = []
    train_y = []
    cnt = 0
    for inp, mn in self.training_data:
        x = self.compute_x(inp)
        y = self.compute_y(mn)
        train_x.append(x)
        train_y.append(y)
        cnt += 1

    self.train_x = np.array(train_x, np.int32)
    # one-hot encode the module labels
    self.train_y = keras.utils.to_categorical(train_y, len(self.output_dict))

    logging.info("computing datasets done. train:x=%s,y=%s" % (self.train_x.shape, self.train_y.shape))

    #
    # define the keras model
    #

    self._setup_model()

    #
    # fit training data
    #

    # keep the weights of the best validation-loss epoch only
    best_loss = 100.0
    best_epoch = 0
    for epoch in range(EPOCHS):
        h = self.keras_model.fit(self.train_x, self.train_y, epochs=1, validation_split=VALIDATION_SPLIT, batch_size=BATCH_SIZE)
        cur_loss = h.history['val_loss'][0]
        if cur_loss < best_loss:
            best_loss = cur_loss
            best_epoch = epoch
            # NOTE(review): 'TUN' in the log message looks like a typo for 'RUN'
            logging.info("%3d/%3d *** BEST LOSS SO FAR IN THIS TUN: %f FROM THIS EPOCH" % (epoch+1, EPOCHS, best_loss))
            # save the result
            self.keras_model.save_weights(self.keras_weights_fn, overwrite=True)
            logging.info ('%s written.' % self.keras_weights_fn)
        else:
            logging.info("%3d/%3d --- BEST LOSS SO FAR IN THIS TUN: %f FROM EPOCH %d" % (epoch+1, EPOCHS, best_loss, best_epoch))
work_dir = WORKDIR % options.lang logging.info ('work_dir: %s' % work_dir) logging.info ("loading transcripts...") transcripts = Transcripts(lang=options.lang) logging.info ("loading transcripts... done.") # # merge sentences # logging.info ('merging sentence sources...') mkdirs('%s' % work_dir) num_sentences = 0 train_fn = '%s/train_all.txt' % work_dir with codecs.open (train_fn, 'w', 'utf8') as dstf: logging.info ('adding transcripts...') for cfn in transcripts: ts = transcripts[cfn]['ts'] if len(ts)<2: continue dstf.write(u'%s\n' % ts)
logging.info ( "loading lexicon...") lex = Lexicon(lang=options.lang) logging.info ( "loading lexicon...done.") logging.info ( "loading transcripts...") transcripts = Transcripts(lang=options.lang) ts_all, ts_train, ts_test = transcripts.split(limit=options.debug, add_all=options.add_all) logging.info ( "loading transcripts (%d train, %d test) ...done." % (len(ts_train), len(ts_test))) # # create work_dir structure # misc.mkdirs('%s/lexicon' % data_dir) misc.mkdirs('%s/local/dict' % data_dir) misc.mkdirs(wav16_dir) misc.mkdirs(mfcc_dir) misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir) misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir) # # kaldi data part # def export_kaldi_data (destdirfn, tsdict): global wav16_dir
else: logging.basicConfig(level=logging.INFO) # # config # config = misc.load_config ('.speechrc') speech_arc = config.get("speech", "speech_arc") speech_corpora = config.get("speech", "speech_corpora") # # convert mp3 to wav, in speaker directories # misc.mkdirs('%s/cv_corpus_v3' % (speech_corpora,)) cnt = 0 spk_ids = set() with open('tmp/run_parallel.sh', 'w') as scriptf, \ open('%s/cv_corpus_v3/utt_test.txt' % speech_corpora, 'w') as utt_testf: files = [ ('train.tsv', False), ('dev.tsv', False), ('test.tsv', True), ('validated.tsv', False), # ('other.tsv', False), # ('invalidated.tsv', False), ] for (tsvfn, is_test) in files: with codecs.open('%s/cv_corpus_v3/%s' % (speech_arc, tsvfn), 'r', 'utf8') as tsvf:
def train(self, num_epochs, incremental):
    """Train the LSTM skill-classification model on db training data.

    num_epochs  -- number of keras training epochs
    incremental -- if True, reuse existing model dir and skills dict;
                   otherwise create the model dir and recompute the dict

    Saves the trained weights to self.weights_fn.
    """

    # load discourses from db, resolve non-unique inputs (implicit or of
    # responses): later rows overwrite earlier ones in self.drs
    logging.info('load discourses from db...')

    self.drs = {}
    self.training_data = []

    for dr in self.session.query(model.TrainingData).filter(model.TrainingData.lang==self.lang):
        self.drs[dr.inp] = dr.skill
        self.training_data.append((tokenize(dr.inp, lang=self.lang), dr.skill))
        if DEBUG_LIMIT>0 and len(self.drs)>=DEBUG_LIMIT:
            logging.warn(' stopped loading discourses because DEBUG_LIMIT of %d was reached.' % DEBUG_LIMIT)
            break

    shuffle(self.training_data)

    #
    # set up model dir
    #

    if not incremental:
        mkdirs(self.model_dir)

    #
    # load word embeddings
    #

    self._load_word_embeddings()

    #
    # load or create decoder dict
    #

    if incremental:
        logging.info("loading skills dict...")
        self._load_skills_dict()
    else:
        logging.info("computing skills dict...")
        self._compute_skills_dict()
        self._save_skills_dict()

    #
    # compute datasets
    #

    logging.info("computing datasets...")

    num_decoder_tokens = len (self.skills_dict)

    # inputs: per-token word embeddings, zero-padded to max_inp_len;
    # targets: one-hot skill vectors
    encoder_input_data = np.zeros(
        (len(self.training_data), self.max_inp_len, self.embed_dim),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(self.training_data), len(self.skills_dict)),
        dtype='float32')

    for i, (inp, skill) in enumerate(self.training_data):
        for j, token in enumerate(inp):
            # tokens without an embedding stay all-zero
            if unicode(token) in self.embedding_dict:
                encoder_input_data[i, j] = self.embedding_dict[unicode(token)]
        skill_idx = self.skills_dict[skill]
        decoder_target_data[i, skill_idx] = 1.
        # logging.debug ("%-10s %2d %s" % (skill, skill_idx, repr(inp)))

    # import pdb; pdb.set_trace()

    logging.info("computing datasets done. encoder_input_data.shape=%s, decoder_target_data.shape=%s" % (repr(encoder_input_data.shape), repr(decoder_target_data.shape)))

    #
    # LSTM RNN classifier model setup and training starts here
    #

    self._create_keras_model()

    self.keras_model_train.fit([encoder_input_data], decoder_target_data,
                               batch_size=self.batch_size,
                               epochs=num_epochs,
                               validation_split=0.2)

    self.keras_model_train.save_weights(self.weights_fn)
    logging.info("weights written to %s ." % self.weights_fn)
srcdir, localedir, gender, speaker, book) if not os.path.exists(metafn): continue with codecs.open(metafn, 'r', 'utf8') as metaf: meta = json.loads(metaf.read()) logging.debug( 'localedir: %s, gender: %6s, speaker: %16s, book: %s' % (localedir, gender, speaker, book)) folder = 'mailabs%s-%s' % (speaker.replace( '_', '').replace('-', ''), book.replace('_', '-')) dstdir = '%s/%s' % (destdir, folder) misc.mkdirs('%s/wav' % dstdir) misc.mkdirs('%s/etc' % dstdir) promptsfn = '%s/etc/prompts-original' % dstdir logging.debug('dstdir: %s, promptsfn: %s' % (dstdir, promptsfn)) with codecs.open(promptsfn, 'w', 'utf8') as promptsf: for wavfn in meta: ts_orig = meta[wavfn]['clean'] uttid = os.path.splitext(wavfn.replace('_', '-'))[0] if uttid in all_utts: logging.error('utterance id not unique:' %
# config = misc.load_config('.speechrc') w2l_env_activate = config.get("speech", "w2l_env_activate") w2l_decoder = config.get("speech", "w2l_decoder") wav16_dir = config.get("speech", "wav16") # # create basic work dir structure # cmd = 'rm -rf %s' % WORK_DIR logging.debug(cmd) os.system(cmd) misc.mkdirs('%s/test' % data_dir) # # scripts # misc.render_template('data/src/speech/w2l_run_auto_review.sh.template', '%s/run_auto_review.sh' % WORK_DIR, w2l_env_activate=w2l_env_activate, w2l_decoder=w2l_decoder, cuda_device=CUDA_DEVICE, w2l_tokensdir='../../data/models/%s' % model_name, w2l_tokens='tokens.txt', w2l_lexicon='../../data/models/%s/lexicon.txt' % model_name, w2l_am='../../data/models/%s/model.bin' % model_name,
# # load transcripts # logging.info ( "loading transcripts...") transcripts = Transcripts(corpus_name=options.lang) logging.info ( "loading transcripts...done. %d transcripts." % len(transcripts)) logging.info ("splitting transcripts...") ts_all, ts_train, ts_test = transcripts.split() logging.info ("splitting transcripts done, %d train, %d test." % (len(ts_train), len(ts_test))) # # create work_dir # misc.mkdirs('%s' % work_dir) # export csv files csv_train_fn = '%s/train.csv' % work_dir csv_dev_fn = '%s/dev.csv' % work_dir csv_test_fn = '%s/test.csv' % work_dir alphabet = set() vocabulary = [] def export_ds(ds, csv_fn): global alphabet cnt = 0
def kaldi_adapt_lm(kaldi_root, src_model_dir, lm_fn, work_dir, dst_model_name):
    """Adapt a pre-trained kaldi ASR model to a new language model.

    Copies the dictionary and phoneme sets plus the acoustic model files
    from src_model_dir into a fresh skeleton under work_dir, places the
    ARPA language model lm_fn next to them, installs the adaptation
    scripts/templates, then runs the kaldi adaptation and the model
    distribution stage via the shell.

    kaldi_root     -- kaldi installation directory (must contain egs/wsj/s5)
    src_model_dir  -- directory of the source model to adapt
    lm_fn          -- path of the ARPA language model to adapt to
    work_dir       -- scratch directory the adaptation is performed in
    dst_model_name -- name passed to model-dist.sh for the resulting model

    Raises Exception if kaldi is not found at kaldi_root or if one of the
    shell stages exits with a non-zero status.
    """

    # sanity check: make sure kaldi is really installed where we expect it
    steps_path = '%s/egs/wsj/s5/steps' % kaldi_root
    if not os.path.exists(steps_path):
        raise Exception('%s does not exist - is kaldi really installed in %s ?' % (steps_path, kaldi_root))

    tmpl_dir = os.path.dirname(os.path.abspath(__file__)) + '/templates'

    #
    # copy dictionary and phoneme sets from original model
    #

    logging.info("copying dictionary and phoneme sets from original model...")
    misc.mkdirs('%s/data/local/dict' % work_dir)
    misc.copy_file('%s/data/local/dict/lexicon.txt' % src_model_dir, '%s/data/local/dict/lexicon.txt' % work_dir)
    misc.copy_file('%s/data/local/dict/nonsilence_phones.txt' % src_model_dir, '%s/data/local/dict/nonsilence_phones.txt' % work_dir)
    misc.copy_file('%s/data/local/dict/silence_phones.txt' % src_model_dir, '%s/data/local/dict/silence_phones.txt' % work_dir)
    misc.copy_file('%s/data/local/dict/optional_silence.txt' % src_model_dir, '%s/data/local/dict/optional_silence.txt' % work_dir)
    misc.copy_file('%s/data/local/dict/extra_questions.txt' % src_model_dir, '%s/data/local/dict/extra_questions.txt' % work_dir)

    #
    # language model
    #

    misc.copy_file(lm_fn, '%s/lm.arpa' % work_dir)

    #
    # create skeleton dst model
    #

    logging.info("creating skeleton destination model...")

    misc.mkdirs('%s/exp/adapt' % work_dir)

    misc.copy_file('%s/model/final.mdl' % src_model_dir, '%s/exp/adapt/final.mdl' % work_dir)
    misc.copy_file('%s/model/cmvn_opts' % src_model_dir, '%s/exp/adapt/cmvn_opts' % work_dir)
    misc.copy_file('%s/model/tree' % src_model_dir, '%s/exp/adapt/tree' % work_dir)

    # these files exist only for some model types - copy them if present
    for optional_file in ['final.mat', 'splice_opts', 'final.occs', 'full.mat']:
        if os.path.exists('%s/model/%s' % (src_model_dir, optional_file)):
            misc.copy_file('%s/model/%s' % (src_model_dir, optional_file), '%s/exp/adapt/%s' % (work_dir, optional_file))

    # models with an i-vector extractor need it (and its conf) carried along
    if os.path.exists('%s/extractor' % src_model_dir):

        misc.mkdirs('%s/exp/extractor' % work_dir)
        misc.copy_file('%s/extractor/final.mat' % src_model_dir, '%s/exp/extractor/final.mat' % work_dir)
        misc.copy_file('%s/extractor/global_cmvn.stats' % src_model_dir, '%s/exp/extractor/global_cmvn.stats' % work_dir)
        misc.copy_file('%s/extractor/final.dubm' % src_model_dir, '%s/exp/extractor/final.dubm' % work_dir)
        misc.copy_file('%s/extractor/final.ie' % src_model_dir, '%s/exp/extractor/final.ie' % work_dir)
        misc.copy_file('%s/extractor/splice_opts' % src_model_dir, '%s/exp/extractor/splice_opts' % work_dir)

        misc.mkdirs('%s/exp/ivectors_test_hires/conf' % work_dir)
        misc.copy_file('%s/ivectors_test_hires/conf/splice.conf' % src_model_dir, '%s/exp/ivectors_test_hires/conf' % work_dir)

    misc.mkdirs('%s/conf' % work_dir)
    misc.copy_file('%s/conf/mfcc.conf' % src_model_dir, '%s/conf/mfcc.conf' % work_dir)
    misc.copy_file('%s/conf/mfcc_hires.conf' % src_model_dir, '%s/conf/mfcc_hires.conf' % work_dir)
    misc.copy_file('%s/conf/online_cmvn.conf' % src_model_dir, '%s/conf/online_cmvn.conf' % work_dir)

    #
    # copy scripts and config files
    #

    misc.copy_file('%s/kaldi-run-adaptation.sh' % tmpl_dir, '%s/run-adaptation.sh' % work_dir)
    misc.copy_file('%s/kaldi-cmd.sh' % tmpl_dir, '%s/cmd.sh' % work_dir)
    misc.render_template('%s/kaldi-path.sh.template' % tmpl_dir, '%s/path.sh' % work_dir, kaldi_root=kaldi_root)
    misc.copy_file('%s/kaldi-model-dist.sh' % tmpl_dir, '%s/model-dist.sh' % work_dir)

    misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir)
    misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir)

    cmd = '/bin/bash -c "pushd %s && bash run-adaptation.sh && popd"' % work_dir
    logging.info(cmd)
    # BUGFIX: the exit status used to be ignored - fail loudly so a broken
    # adaptation run is not silently packaged into a model.
    if os.system(cmd) != 0:
        raise Exception('adaptation run failed: %s' % cmd)

    # BUGFIX: dst_model_name was interpolated inside nested double quotes
    # ("%s"), which terminated the bash -c string early; single-quote it
    # instead so names with spaces or shell metacharacters survive intact.
    cmd = '/bin/bash -c "pushd %s && bash model-dist.sh \'%s\' && popd"' % (work_dir, dst_model_name)
    logging.info(cmd)
    if os.system(cmd) != 0:
        raise Exception('model-dist failed: %s' % cmd)
# config = misc.load_config('.speechrc') corpora = config.get("speech", "speech_corpora") wav16_dir = config.get("speech", "wav16") out_dir = '%s/%s' % (corpora, corpus_out) tmpfn_base = '/tmp/tmp16_%08x' % os.getpid() if os.path.exists(out_dir): logging.error("%s already exists!" % out_dir) sys.exit(1) logging.info("creating %s ..." % out_dir) misc.mkdirs(out_dir) # # count good transcripts # total_good = 0 for ts in transcripts: if transcripts[ts]['quality'] < MIN_QUALITY: continue total_good += 1 # # main #
# # config # config = misc.load_config('.speechrc') kaldi_root = config.get("speech", "kaldi_root") wav16_dir = config.get("speech", "wav16") # # create basic work dir structure # # FIXME: unused, remove misc.mkdirs('%s/lexicon' % data_dir) misc.mkdirs('%s/local/dict' % data_dir) misc.mkdirs(wav16_dir) misc.mkdirs(mfcc_dir) misc.symlink('../../../../../%s' % language_model_dir, '%s/lm' % work_dir) misc.symlink('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % work_dir) misc.symlink('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % work_dir) # # generate speech and text corpora # logging.info("loading lexicon...") lex = Lexicon(file_name=dictionary) logging.info("loading lexicon...done.") if sequitur_model_path:
(options, args) = parser.parse_args() if options.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if len(args) < 2: parser.print_usage() sys.exit(1) language_model = args[0] text_corpora = args[1:] outdir = '%s/%s' % (LANGUAGE_MODELS_DIR, language_model) mkdirs(outdir) # # extract sentences into one big text file # train_fn = '%s/train_all.txt' % outdir num_sentences = 0 with codecs.open(str(train_fn), 'w', 'utf8') as dstf: for text_corpus_name in text_corpora: src = '%s/%s.txt' % (TEXT_CORPORA_DIR, text_corpus_name) logging.info('reading from sources %s' % src) with codecs.open(str(src), 'r', 'utf8') as srcf: while True:
srcdirfn = args[1] # # config # kaldi_root = config.get("speech", "kaldi_root") # # clean up leftovers from previous runs # cmd = 'rm -rf %s' % WORKDIR logging.info(cmd) os.system(cmd) misc.mkdirs(WORKDIR) # # copy scripts # misc.copy_file ('data/src/speech/kaldi-run-segmentation.sh', '%s/run-segmentation.sh' % WORKDIR) misc.copy_file ('data/src/speech/kaldi-cmd.sh', '%s/cmd.sh' % WORKDIR) misc.render_template ('data/src/speech/kaldi-path.sh.template', '%s/path.sh' % WORKDIR, kaldi_root=kaldi_root) misc.symlink ('%s/egs/wsj/s5/steps' % kaldi_root, '%s/steps' % WORKDIR) misc.symlink ('%s/egs/wsj/s5/utils' % kaldi_root, '%s/utils' % WORKDIR) # # create skeleton dst model #
def train(self, num_steps, incremental):
    """Train the seq2seq dialog model for up to num_steps global steps.

    Loads training discourses from the DB (keeping, per input, only the
    responses of the highest priority seen), parses the JSON token
    sequences, builds train/dev datasets, then runs the TensorFlow
    training loop: every STEPS_PER_STAT steps it evaluates dev-set
    perplexity and checkpoints the model whenever that improves.

    num_steps   -- total global-step count to train up to
    incremental -- if True, keep model_dir, load the saved input/output
                   dicts and restore the latest checkpoint instead of
                   starting from scratch
    """

    # load discourses from db, resolve non-unique inputs (implicit or of responses)

    logging.info('load discourses from db...')

    drs = {}       # input -> set of responses (only highest-prio ones kept)
    drs_prio = {}  # input -> priority of the responses currently in drs

    for dr in self.session.query(model.TrainingData).filter(
            model.TrainingData.lang == self.lang):

        if not dr.inp in drs:
            drs[dr.inp] = set()
        if not dr.inp in drs_prio:
            drs_prio[dr.inp] = dr.prio

        if dr.prio > drs_prio[dr.inp]:
            # discard lower-prio responses
            logging.info('DRS discarding: %s -> %s' % (dr.inp, repr(drs[dr.inp])))
            drs[dr.inp] = set()
            drs_prio[dr.inp] = dr.prio
        else:
            if dr.prio < drs_prio[dr.inp]:
                logging.info('DRS skipping: %s -> %s' % (dr.inp, repr(dr.resp)))
                continue

        drs[dr.inp].add(dr.resp)

        if DEBUG_LIMIT > 0 and len(drs) >= DEBUG_LIMIT:
            logging.warn(' stopped loading discourses because DEBUG_LIMIT of %d was reached.' % DEBUG_LIMIT)
            break

    # parse json, implicit or responses:
    # NOTE(review): unicode() -> this code targets Python 2

    self.training_data = []

    for inp in drs:

        td_inp = list(map(lambda a: unicode(a), json.loads(inp)))

        td_resp = []
        num_resp = 0
        for r in drs[inp]:
            td_r = list(map(lambda a: unicode(a), json.loads(r)))
            # alternative responses for the same input are joined by OR_SYMBOL
            if len(td_resp) > 0:
                td_resp.append(OR_SYMBOL)
            td_resp.extend(td_r)
            if len(td_r) > 0:
                num_resp += 1
            # cap the number of alternative responses per input
            if num_resp > MAX_NUM_RESP:
                break

        self.training_data.append((td_inp, td_resp))

    #
    # set up model dir
    #

    if not incremental:
        try:
            shutil.rmtree(self.model_dir)
        except:
            # best effort: model_dir may simply not exist yet
            pass
        mkdirs(self.model_dir)

    #
    # 2D diagram of available data
    #

    # dia[input_len][output_len] = number of samples with those lengths;
    # printed as an ASCII-art histogram, capped at 56 columns
    dia = self.compute_2d_diagram()

    print(
        " n i o 01020304050607080910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455"
    )

    mol = 0  # maximum output length seen so far
    for inp_len in range(len(dia)):
        s = 0            # total sample count for this input length
        l = ''           # ascii-art row
        output_len = 0   # longest non-empty output bucket in this row
        cnt = 0
        for n in dia[inp_len]:
            if cnt < 56:
                l += ' ' + self._ascii_art(n)
            s += n
            cnt += 1
            if n > 0:
                output_len = cnt
        if output_len > mol:
            mol = output_len
        print('%6d %2d %2d %s' % (s, inp_len + 1, mol, l))

    #
    # load or create input/output dicts
    #

    if incremental:
        logging.info("loading input and output dicts...")
        self.load_dicts()
    else:
        logging.info("computing input and output dicts...")
        self.compute_dicts()
        self.save_dicts()

    #
    # compute datasets
    #

    logging.info("computing datasets...")

    self.ds_train = []
    self.ds_dev = []

    cnt = 0
    for inp, resp in self.training_data:

        x = self.compute_x(inp)
        # print dr.inp, x
        if len(x) <= 0:
            logging.error("len(x)<=0: %s -> %s" % (repr(inp), repr(resp)))
            continue
        y = self.compute_y(resp)
        # print dr.resp, y

        # every 50th sample (offset 9) goes to the dev set, the rest to train
        if cnt % 50 == 9:
            data_set = self.ds_dev
        else:
            data_set = self.ds_train
        data_set.append([x, y])

        cnt += 1

    logging.info(
        "computing datasets done. len(ds_train)=%d, len(ds_dev)=%d"
        % (len(self.ds_train), len(self.ds_dev)))

    #
    # seq2seq model setup and training starts here
    #

    #
    # setup config to use BFC allocator
    config = tf.ConfigProto()
    # config.gpu_options.allocator_type = 'BFC'

    with tf.Session(config=config) as tf_session:
        with open('%s/train.log' % self.model_dir, 'w') as logf:

            tf_model = self.create_tf_model(tf_session, 'train')

            # load latest state in incremental mode
            if incremental:
                tf_model.restore(tf_session, self.model_fn)

            # this is the training loop

            step_time, loss, best_perplexity = 0.0, 0.0, 100000.0
            current_step = 0
            best_step = 0  # NOTE(review): assigned below but never read afterwards
            # previous_losses = []

            while current_step <= num_steps:

                # get a random training batch and perform a training step on it
                start_time = time()
                source, source_len, target, target_len = self._prepare_batch(
                    self.ds_train)
                step_loss, summary = tf_model.train(
                    tf_session,
                    encoder_inputs=source,
                    encoder_inputs_length=source_len,
                    decoder_inputs=target,
                    decoder_inputs_length=target_len)

                # accumulate per-window averages (window = STEPS_PER_STAT steps)
                step_time += (time() - start_time) / STEPS_PER_STAT
                loss += step_loss / STEPS_PER_STAT
                current_step = tf_model.global_step.eval()

                if current_step % STEPS_PER_STAT == 0:

                    # print statistics for the previous epoch.
                    perplexity = math.exp(loss) if loss < 300 else float(
                        'inf')
                    steps_done = tf_model.global_step.eval()
                    eta = (num_steps - steps_done) * step_time

                    #
                    # decrease learning rate if no improvement was seen over last 3 times.
                    #
                    # if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    #     tf_session.run(tf_model.learning_rate_decay_op)
                    # previous_losses.append(loss)

                    sum_dev_loss = 0.0
                    # for i in range (NUM_EVAL_STEPS):
                    #     # get a random dev batch and perform an eval step on it
                    #     source, source_len, target, target_len = self._prepare_batch(self.ds_dev)
                    #     dev_loss, summary = tf_model.eval (tf_session,
                    #                                        encoder_inputs=source, encoder_inputs_length=source_len,
                    #                                        decoder_inputs=target, decoder_inputs_length=target_len)
                    #     sum_dev_loss += dev_loss
                    # sum_dev_loss /= NUM_EVAL_STEPS

                    # NOTE(review): integer division under Python 2; under
                    # Python 3 this yields a float and breaks range() - confirm
                    # target version. Also divides by zero below if the dev
                    # set is smaller than one batch.
                    num_eval_steps = len(self.ds_dev) / self.batch_size
                    for i in range(num_eval_steps):
                        # get a random dev batch and perform an eval step on it
                        source, source_len, target, target_len = self._prepare_batch(
                            self.ds_dev, i * self.batch_size)
                        dev_loss, summary = tf_model.eval(
                            tf_session,
                            encoder_inputs=source,
                            encoder_inputs_length=source_len,
                            decoder_inputs=target,
                            decoder_inputs_length=target_len)
                        sum_dev_loss += dev_loss
                    sum_dev_loss /= num_eval_steps

                    dev_perplexity = math.exp(
                        sum_dev_loss) if sum_dev_loss < 300 else float(
                            'inf')

                    log_str = "global step %6d/%6d step-time %.6fs ETA %.2fs train_perpl %.6f dev_perpl %.6f" % \
                              (steps_done, num_steps, step_time, eta, perplexity, dev_perplexity)
                    logging.info(log_str)
                    logf.write(log_str + '\n')

                    # checkpoint whenever dev perplexity improves
                    if dev_perplexity < best_perplexity:
                        best_perplexity = dev_perplexity
                        best_step = tf_model.global_step.eval()
                        log_str = " *** best eval result so far"
                        logging.info(log_str)
                        logf.write(log_str + '\n')
                        # tf_model.save(tf_session, self.model_fn, global_step=tf_model.global_step)
                        tf_model.save(tf_session, self.model_fn, global_step=None)

                    # reset per-window statistics
                    step_time, loss = 0.0, 0.0

                    sys.stdout.flush()
                    logf.flush()

    logging.info("training finished.")
cmd = './cluster_individual.sh wav/%s' % fn print "%6d/%6d %s" % (cnt, total, cmd) scriptf.write('echo %s\n' %fn) scriptf.write('%s &\n' % cmd) scriptf.write('wait\n') os.system('bash run_parallel.sh') ######################################################## # Get a sample from each file for each cluster # ######################################################## if stage <= 2: misc.mkdirs('sample') # # save all clusters appearing in each file # for f in data/*; do # fname=`echo "$f" | rev | cut -f1 -d'/' | rev` # echo python get_clust.py ${f}/${fname}.c.3.seg # python get_clust.py ${f}/${fname}.c.3.seg # done with open ('run_parallel.sh', 'w') as scriptf: cnt = 0 for fn in os.listdir('data'): cnt += 1 if (cnt % nj) == 0: