def main():
    """Prepare train/valid/test text and a unigram file for each task size.

    For every task size in (1, 2, 4) a directory ``<size>/data/`` is created
    and the following steps run in order:

    1. ``GetTrainTxt`` writes ``train.txt.all`` from the training directory.
    2. ``GetVocab`` fills a vocabulary dict from that file, ``CutVocab``
       trims it to ``20000 - 1`` entries and ``WriteVocab`` stores it as
       ``train.unigram``.
    3. ``CutTxt`` is applied with that vocabulary to the full training text
       and to the two held-out shards, producing ``train.txt``,
       ``valid.txt`` and ``test.txt``.
    """
    dataroot = 'data/1-billion/'
    traindir = dataroot + 'training-monolingual.tokenized.shuffled/'
    valid_txt = dataroot + 'heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050'
    test_txt = dataroot + 'heldout-monolingual.tokenized.shuffled/news.en.heldout-00001-of-00050'

    for task_size in (1, 2, 4):
        print('tsk = {}'.format(task_size))

        task_dir = '{}/'.format(task_size)
        wb.mkdir(task_dir)
        wb.mkdir(task_dir + 'data')

        train_all_out = task_dir + 'data/train.txt.all'
        unigram_out = task_dir + 'data/train.unigram'

        GetTrainTxt(traindir, train_all_out, task_size)

        vocab = {}
        GetVocab(train_all_out, vocab)
        # One slot short of 20000 so that <unk> still fits.
        CutVocab(vocab, 20000 - 1)
        WriteVocab(unigram_out, vocab)

        # (input text, output text) pairs, processed in this order.
        cut_jobs = (
            (train_all_out, task_dir + 'data/train.txt'),
            (valid_txt, task_dir + 'data/valid.txt'),
            (test_txt, task_dir + 'data/test.txt'),
        )
        for src_txt, dst_txt in cut_jobs:
            CutTxt(src_txt, dst_txt, vocab)
def main():
    """Train n-gram models of orders 2 to 5 when ``-train`` is on the command line.

    The command line is echoed, and a usage hint is printed when no option
    is given (execution continues afterwards). Models are written to
    ``<cwd>/ngramlm/<order>gram.lm``; ``model.train`` also receives the path
    ``<cwd>/models_ppl.txt`` as the result file. ``model.prepare`` is called
    once, just before the first order is trained.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    absdir = os.getcwd() + '/'
    bindir = absdir + '../../tools/srilm/'
    workdir = absdir + 'ngramlm/'
    wb.mkdir(workdir)

    datas = [absdir + rel_path for rel_path in data()]
    result_file = absdir + 'models_ppl.txt'  # the result file
    model = ngram.model(bindir, workdir)

    for position, order in enumerate([2, 3, 4, 5]):
        write_model = workdir + '{}gram.lm'.format(order)
        print(write_model)

        if '-train' not in sys.argv:
            continue

        # The data only has to be prepared once, ahead of the first model.
        if position == 0:
            model.prepare(datas[0], datas[1], datas[2])
        model.train(order, write_model, result_file)
def rescore_all(workdir, nbestdir, config):
    """Call ``lstm.rescore`` for each of the four n-best task lists.

    The tasks are ``nbestlist_{dt05,et05}_{real,simu}``. For each one the
    input is ``<nbestdir><task>/words_text`` and the scores are written to
    ``<workdir><name>/<task>/lmwt.lstm``, where ``<name>`` is the
    second-to-last ``/``-separated component of *nbestdir*.

    Args:
        workdir: output root; also passed to ``lstm.rescore`` as its first
            argument.
        nbestdir: directory holding one sub-directory per task.
        config: passed through unchanged to ``lstm.rescore``.
    """
    for dataset in ['dt05', 'et05']:
        for condition in ['real', 'simu']:
            tsk = 'nbestlist_{}_{}'.format(dataset, condition)
            print('process ' + tsk)

            outdir = workdir + nbestdir.split('/')[-2] + '/' + tsk + '/'
            wb.mkdir(outdir)

            lstm.rescore(workdir,
                         nbestdir + tsk + '/words_text',
                         outdir + 'lmwt.lstm',
                         config)
def _load_lmscore(lmpaths, lmtype, tsk):
    """Return the LM score array for *lmtype* on task *tsk*.

    *lmtype* is either a single key of *lmpaths* or two keys joined by
    ``+``; in the latter case the two score vectors are averaged with equal
    weights. The ``<tsk>`` placeholder in each path is replaced by *tsk*.

    Raises:
        ValueError: if *lmtype* names more than two models. Such a value
            previously left the score variable unset (or silently reused
            the scores of the preceding LM type).
    """
    names = lmtype.split('+')
    if len(names) not in (1, 2):
        raise ValueError(
            'unsupported lmtype {!r}: expected "A" or "A+B"'.format(lmtype))

    scores = [np.array(wb.LoadScore(lmpaths[name].replace('<tsk>', tsk)))
              for name in names]
    if len(scores) == 1:
        return scores[0]
    return 0.5 * scores[0] + 0.5 * scores[1]


def wer_all(workdir, nbestdir, lmpaths, lmtypes):
    """Compute the WER for each task, each LM type and each LM scale.

    For every task ``nbestlist_{dt05,et05}_{real,simu}`` the function reads
    the n-best list, transcript, acoustic scores (``acwt``) and the
    ``lmwt.nolm`` scores from ``<nbestdir><task>/``, writes an ``<UNK>``-free
    copy of the n-best list, and then, for each entry of *lmtypes* and each
    LM scale in ``np.linspace(9, 15, 7)``, picks the best hypotheses with
    ``wb.GetBest`` and scores them with ``wb.CmpWER``. One line per
    (lmtype, lmscale) is printed and appended to ``<workdir><task>/wer.txt``;
    the per-type LM scores are also saved as ``<lmtype>.lmscore``.

    Args:
        workdir: output directory (string prefix, expected to end with a
            path separator since paths are built by concatenation).
        nbestdir: directory prefix containing one sub-directory per task.
        lmpaths: mapping from LM name to score-file path; the substring
            ``<tsk>`` in a path is replaced by the task name.
        lmtypes: iterable of LM names, or two names joined by ``+`` for an
            equal-weight combination.

    Raises:
        ValueError: if an entry of *lmtypes* combines more than two models.
    """
    wb.mkdir(workdir)

    tasks = ['nbestlist_{}_{}'.format(a, b)
             for a in ['dt05', 'et05']
             for b in ['real', 'simu']]

    for tsk in tasks:
        print(tsk)
        wb.mkdir(workdir + tsk)

        read_nbest_txt = nbestdir + tsk + '/words_text'
        read_transcript = nbestdir + tsk + '/text'
        read_acscore = nbestdir + tsk + '/acwt'
        read_gfscore = nbestdir + tsk + '/lmwt.nolm'
        read_nbest_rmunk = workdir + tsk + '/words_text_rmunk'

        # The context manager guarantees the report is closed even when a
        # later loading or scoring step raises.
        with open(workdir + tsk + '/wer.txt', 'wt') as fwer:
            # remove the <UNK> in nbest
            nbest_rmUNK(read_nbest_txt, read_nbest_rmunk)

            # load score
            acscore = np.array(wb.LoadScore(read_acscore))
            gfscore = np.array(wb.LoadScore(read_gfscore))

            # load label
            score_label = wb.LoadLabel(read_acscore)

            for lmtype in lmtypes:
                lmscore = _load_lmscore(lmpaths, lmtype, tsk)

                # write lmscore
                wb.WriteScore(workdir + tsk + '/' + lmtype + '.lmscore',
                              lmscore, score_label)

                for lmscale in np.linspace(9, 15, 7):
                    write_best = workdir + tsk + '/{}_lmscale={}.best'.format(
                        lmtype, lmscale)
                    total = acscore + lmscale * (lmscore + gfscore)
                    wb.GetBest(read_nbest_rmunk, total.tolist(), write_best)

                    err, num, wer_rate = wb.CmpWER(write_best, read_transcript)
                    # The best-hypothesis file is only an intermediate.
                    os.remove(write_best)

                    s = '{} wer={:.2f} err={} num={} lmscale={}'.format(
                        lmtype, wer_rate, err, num, lmscale)
                    print(' ' + s)
                    fwer.write(s + '\n')
                    fwer.flush()
def main():
    """Train, rescore with and evaluate n-gram models of orders 2 to 5.

    The command line is echoed, and a usage hint is printed when no option
    is given (execution continues afterwards). For each order the model file
    is ``<cwd>/ngramlm/<order>gram.lm`` and the following optional stages
    run, depending on which flags appear in ``sys.argv``:

    * ``-train``: ``model.prepare`` (first order only) then ``model.train``.
    * ``-rescore``: ``model.rescore`` on ``datas[3]``, writing a
      ``.lmscore`` file next to the model.
    * ``-wer``: tune with ``wb.TuneWER`` over ``np.linspace(0.1, 0.9, 9)``,
      print the result and record WER, LL and PPL under the key
      ``KN<order>`` in ``<cwd>/models_ppl.txt``.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        print('\"python run_ngram.py -train\" train \n',
              '\"python run_ngram.py -rescore\" rescore nbest\n',
              '\"python run_ngram.py -wer\" compute WER')

    absdir = os.getcwd() + '/'
    bindir = absdir + '../../tools/srilm/'
    workdir = absdir + 'ngramlm/'
    wb.mkdir(workdir)

    datas = [absdir + rel_path for rel_path in data()]
    result_file = absdir + 'models_ppl.txt'  # the result file
    model = ngram.model(bindir, workdir)

    for position, order in enumerate([2, 3, 4, 5]):
        write_model = workdir + '{}gram.lm'.format(order)
        # Same path with the trailing ".lm" swapped for ".lmscore".
        score_file = write_model[0:-3] + '.lmscore'
        print(write_model)

        if '-train' in sys.argv:
            # The data only has to be prepared once, ahead of the first model.
            if position == 0:
                model.prepare(datas[0], datas[1], datas[2])
            model.train(order, write_model, result_file)

        if '-rescore' in sys.argv:
            model.rescore(write_model, order, datas[3], score_file)

        if '-wer' in sys.argv:
            nbest, templ = datas[3:5]
            lmscore = wb.LoadScore(score_file)
            acscore = wb.LoadScore(datas[5])

            wer, lmscale, acscale = wb.TuneWER(
                nbest, templ, lmscore, acscore, np.linspace(0.1, 0.9, 9))
            print('wer={} lmscale={} acscale={}'.format(wer, lmscale, acscale))

            fres = wb.FRes(result_file)
            fres.AddWER('KN{}'.format(order), wer)

            # Perplexity is measured on the transcript with its labels removed.
            trans_txt = workdir + os.path.split(templ)[-1] + '.txt'
            wb.file_rmlabel(templ, trans_txt)
            PPL_temp = model.ppl(write_model, order, trans_txt)
            LL_temp = -wb.PPL2LL(PPL_temp, trans_txt)
            fres.Add('KN{}'.format(order),
                     ['LL-wsj', 'PPL-wsj'],
                     [LL_temp, PPL_temp])
def main(config, tap=True):
    """Assemble the distribution tree under ``config['DIST']``.

    The tree is removed and rebuilt from scratch. ``openvpn.exe``, its
    manifest, the LZO and OpenSSL DLLs and the MSVC CRT are copied into
    ``bin``. When *tap* is true, ``i386`` and ``amd64`` directories are also
    created and filled with the TAP driver files (``.inf``/``.cat``/``.sys``
    from the top level of ``tap-win32/<arch>``) and with ``tapinstall.exe``.

    Args:
        config: mapping providing ``DIST``, ``LZO_DIR``, ``OPENSSL_DIR`` and
            ``MSVC_CRT``.
        tap: whether to include the TAP driver and tapinstall files.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)

    bin_dir = os.path.join(dist, 'bin')
    i386_dir = os.path.join(dist, 'i386')
    amd64_dir = os.path.join(dist, 'amd64')

    # build dist and subdirectories
    rm_rf(dist)
    mkdir(dist)
    mkdir(bin_dir)
    if tap:
        mkdir(i386_dir)
        mkdir(amd64_dir)

    # copy openvpn.exe and manifest
    for name in ('openvpn.exe', 'openvpn.exe.manifest'):
        cp(home_fn(name), bin_dir)

    # copy DLL dependencies
    dll_sources = (
        ('LZO_DIR', '/bin/lzo2.dll'),
        ('OPENSSL_DIR', '/bin/libeay32.dll'),
        ('OPENSSL_DIR', '/bin/ssleay32.dll'),
    )
    for key, rel_path in dll_sources:
        cp(home_fn(config[key] + rel_path), bin_dir)

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), bin_dir)

    if not tap:
        return

    # copy TAP drivers
    driver_exts = ('.inf', '.cat', '.sys')
    for arch, target in (('amd64', amd64_dir), ('i386', i386_dir)):
        drv_src = home_fn(os.path.join('tap-win32', arch))
        for _dirpath, _dirnames, filenames in os.walk(drv_src):
            for fname in filenames:
                if os.path.splitext(fname)[1] in driver_exts:
                    cp(os.path.join(drv_src, fname), target)
            # Only the top level of the driver directory is inspected.
            break

    # copy tapinstall
    targets = {'amd64': amd64_dir, 'i386': i386_dir}
    for dirpath, _dirnames, filenames in os.walk(home_fn('tapinstall')):
        for fname in filenames:
            if fname != 'tapinstall.exe':
                continue
            # The containing directory's name selects the architecture.
            arch = os.path.basename(dirpath)
            if arch in targets:
                cp(os.path.join(dirpath, fname), targets[arch])
def main(config, tap=True):
    """Rebuild the distribution directory described by *config*.

    ``config['DIST']`` (resolved through ``home_fn``) is wiped and recreated
    with a ``bin`` sub-directory holding ``openvpn.exe``, its manifest, the
    LZO/OpenSSL DLLs and the MSVC CRT. With *tap* enabled, per-architecture
    ``i386`` and ``amd64`` directories additionally receive the TAP driver
    files found at the top level of ``tap-win32/<arch>`` and any
    ``tapinstall.exe`` located in a directory named after the architecture
    below ``tapinstall``.

    Args:
        config: mapping with the keys ``DIST``, ``LZO_DIR``, ``OPENSSL_DIR``
            and ``MSVC_CRT``.
        tap: include TAP drivers and tapinstall when true.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)

    out_bin = os.path.join(dist, 'bin')
    out_i386 = os.path.join(dist, 'i386')
    out_amd64 = os.path.join(dist, 'amd64')

    # build dist and subdirectories
    rm_rf(dist)
    for new_dir in (dist, out_bin):
        mkdir(new_dir)
    if tap:
        for new_dir in (out_i386, out_amd64):
            mkdir(new_dir)

    # copy openvpn.exe and manifest
    cp(home_fn('openvpn.exe'), out_bin)
    cp(home_fn('openvpn.exe.manifest'), out_bin)

    # copy DLL dependencies
    lzo_dll = config['LZO_DIR'] + '/bin/lzo2.dll'
    cp(home_fn(lzo_dll), out_bin)
    libeay_dll = config['OPENSSL_DIR'] + '/bin/libeay32.dll'
    cp(home_fn(libeay_dll), out_bin)
    ssleay_dll = config['OPENSSL_DIR'] + '/bin/ssleay32.dll'
    cp(home_fn(ssleay_dll), out_bin)

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), out_bin)

    if tap:
        # copy TAP drivers
        wanted = ('.inf', '.cat', '.sys')
        for arch, arch_out in (('amd64', out_amd64), ('i386', out_i386)):
            src_dir = home_fn(os.path.join('tap-win32', arch))
            for _, _, filenames in os.walk(src_dir):
                drivers = [n for n in filenames
                           if os.path.splitext(n)[1] in wanted]
                for driver in drivers:
                    cp(os.path.join(src_dir, driver), arch_out)
                # Stop after the first (top-level) directory.
                break

        # copy tapinstall
        arch_out_by_name = {'amd64': out_amd64, 'i386': out_i386}
        for dirpath, _, filenames in os.walk(home_fn('tapinstall')):
            if 'tapinstall.exe' not in filenames:
                continue
            arch = os.path.basename(dirpath)
            if arch not in arch_out_by_name:
                continue
            # os.walk lists each entry once, but copy per occurrence anyway
            # so the number of cp calls matches a plain scan of the listing.
            for _match in [n for n in filenames if n == 'tapinstall.exe']:
                cp(os.path.join(dirpath, 'tapinstall.exe'),
                   arch_out_by_name[arch])
def __init__(self, bindir, workdir):
    """Record the tool and working directories and create the working one.

    Args:
        bindir: directory of the external binaries; stored on ``self.bindir``
            as returned by ``wb.folder``.
        workdir: working directory; stored on ``self.workdir`` as returned by
            ``wb.folder`` and created through ``wb.mkdir``.
    """
    work_folder = wb.folder(workdir)
    self.workdir = work_folder

    bin_folder = wb.folder(bindir)
    self.bindir = bin_folder

    # mkdir receives the caller's original argument, not the wb.folder result.
    wb.mkdir(workdir)
def main(config, tap=True):
    """Assemble the full Windows distribution tree under ``config['DIST']``.

    The tree is removed and rebuilt with ``bin``, ``i386``, ``amd64`` and
    ``samples`` sub-directories. Executables, their manifests, the GUI, the
    DLL dependencies, ``openssl.exe``, sample configuration files and the
    MSVC CRT are copied in; manifests are embedded with ``mt.exe`` via
    ``run_in_vs_shell``; finally the TAP driver files and
    ``tapinstall.exe`` are copied per architecture.

    Args:
        config: mapping providing ``DIST``, ``OPENVPN_GUI_DIR``,
            ``OPENVPN_GUI``, ``LZO_DIR``, ``OPENSSL_DIR``,
            ``PKCS11_HELPER_DIR`` and ``MSVC_CRT``; ``TAP_PREBUILT`` is
            consulted when *tap* is false.
        tap: true when the TAP driver and tapinstall were built locally
            (taken from ``tap-win32`` and ``tapinstall``); otherwise the
            prebuilt directory ``config['TAP_PREBUILT']`` is used.

    Exits the process with status 1 when a manifest cannot be embedded or
    when neither built nor prebuilt TAP files are configured.
    """
    dist = config['DIST']
    assert dist
    dist = home_fn(dist)
    bin_dir = os.path.join(dist, 'bin')
    i386 = os.path.join(dist, 'i386')
    amd64 = os.path.join(dist, 'amd64')
    samples = os.path.join(dist, 'samples')

    # build dist and subdirectories
    rm_rf(dist)
    mkdir(dist)
    mkdir(bin_dir)
    mkdir(i386)
    mkdir(amd64)
    mkdir(samples)

    # copy openvpn.exe, openvpnserv.exe and their manifests
    cp(home_fn('openvpn.exe'), bin_dir)
    cp(home_fn('openvpn.exe.manifest'), bin_dir)
    cp(home_fn('service-win32/openvpnserv.exe'), bin_dir)
    cp(home_fn('service-win32/openvpnserv.exe.manifest'), bin_dir)

    # copy openvpn-gui
    cp(home_fn(config['OPENVPN_GUI_DIR'] + "/" + config['OPENVPN_GUI']), bin_dir)

    # copy DLL dependencies
    cp(home_fn(config['LZO_DIR'] + '/bin/lzo2.dll'), bin_dir)
    cp(home_fn(config['LZO_DIR'] + '/bin/lzo2.dll.manifest'), bin_dir)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/libeay32.dll'), bin_dir)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/ssleay32.dll'), bin_dir)
    cp(home_fn(config['PKCS11_HELPER_DIR'] + '/lib/libpkcs11-helper-1.dll'), bin_dir)
    cp(home_fn(config['PKCS11_HELPER_DIR'] + '/lib/libpkcs11-helper-1.dll.manifest'), bin_dir)

    # copy OpenSSL utilities (= openssl.exe)
    cp(home_fn(config['OPENSSL_DIR'] + '/bin/openssl.exe'), bin_dir)

    # copy sample config files; renaming is necessary due to openvpn.nsi script
    cp(home_fn('install-win32/sample.ovpn'), samples)
    cp(home_fn('sample-config-files/client.conf'), samples)
    cp(home_fn('sample-config-files/server.conf'), samples)
    rename(os.path.join(samples, 'client.conf'), os.path.join(samples, 'client.ovpn'))
    rename(os.path.join(samples, 'server.conf'), os.path.join(samples, 'server.ovpn'))

    # embed manifests to executables and DLLs
    for f in ["openvpn.exe", "openvpnserv.exe", "lzo2.dll", "libpkcs11-helper-1.dll"]:
        outputresource = os.path.join(bin_dir, f)
        manifest = outputresource + ".manifest"

        # EXEs and DLLs require slightly different treatment: the value is
        # the trailing resource id given to mt.exe's -outputresource option.
        if f.endswith(".exe"):
            resource_id = "1"
        elif f.endswith(".dll"):
            resource_id = "2"
        else:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("ERROR: Could not embed manifest to " + outputresource + ", bailing out.")
            sys.exit(1)

        # Embed the manifest
        run_in_vs_shell('mt.exe -manifest %s -outputresource:%s;%s'
                        % (manifest, outputresource, resource_id))

    # copy MSVC CRT
    cp_a(home_fn(config['MSVC_CRT']), bin_dir)

    if tap:
        # TAP-driver and tapinstall.exe were built, so copy those over
        drv_dir = 'tap-win32'
        ti_dir = 'tapinstall'
    elif 'TAP_PREBUILT' in config:
        # we're using prebuilt TAP-driver and tapinstall.exe
        drv_dir = config['TAP_PREBUILT']
        ti_dir = config['TAP_PREBUILT']
    else:
        print("ERROR: Could not find prebuilt TAP-drivers or tapinstall.exe. Please check win/settings.in")
        sys.exit(1)

    # copy TAP drivers
    for dir_name, arch_dest in (('amd64', amd64), ('i386', i386)):
        drv_src = home_fn(os.path.join(drv_dir, dir_name))
        for dirpath, dirnames, filenames in os.walk(drv_src):
            for f in filenames:
                ext = os.path.splitext(f)[1]
                if ext in ('.inf', '.cat', '.sys'):
                    cp(os.path.join(drv_src, f), arch_dest)
            # Only the top level of the driver directory is inspected.
            break

    # Copy tapinstall.exe (usually known as devcon.exe)
    dest = {'amd64': amd64, 'i386': i386}
    for dirpath, dirnames, filenames in os.walk(home_fn(ti_dir)):
        for f in filenames:
            if f in ('devcon.exe', 'tapinstall.exe'):
                dir_name = os.path.basename(dirpath)
                # Check the architecture before indexing: a match found in
                # any other directory must be skipped, not raise KeyError.
                if dir_name in dest:
                    src = os.path.join(dirpath, f)
                    dst = os.path.join(dest[dir_name], 'tapinstall.exe')
                    cp(src, dst, dest_is_dir=False)
def __init__(self, workdir):
    """Record the working directory, create it and start with no network.

    Args:
        workdir: working directory; stored on ``self.workdir`` as returned by
            ``wb.folder`` and created through ``wb.mkdir``.
    """
    work_folder = wb.folder(workdir)
    self.workdir = work_folder

    # No network object exists yet at construction time.
    self.net = None

    # mkdir receives the caller's original argument, not the wb.folder result.
    wb.mkdir(workdir)
write_lmscore = outdir + 'lmwt.lstm' lstm.rescore(workdir, nbest_txt, write_lmscore, config) if __name__ == '__main__': print(sys.argv) if len(sys.argv) == 1: print( ' \"python run.py -train\" train LSTM\n \"python run.py -rescore\" rescore nbest\n \"python run.py -wer\" compute WER') absdir = os.getcwd() + '/' train = absdir + 'data/train' valid = absdir + 'data/valid' nbestdir = absdir + 'data/nbest/nbest_mvdr_single_heq_multi/' workdir = absdir + 'lstmlm/' wb.mkdir(workdir) os.chdir('../../tools/lstm/') config = '-hidden 500 -epoch 10 -dropout 0 -gpu 2' if '-train' in sys.argv: lstm.train(workdir, train, valid, valid, config) if '-test' in sys.argv: lstm.ppl(workdir, train, config) lstm.ppl(workdir, valid, config) if '-rescore' in sys.argv: rescore_all(workdir, nbestdir, config) if '-wer' in sys.argv: lmpaths = {'KN5': nbestdir + '<tsk>/lmwt.lmonly', 'RNN': nbestdir + '<tsk>/lmwt.rnn', 'LSTM': workdir + nbestdir.split('/')[-2] + '/<tsk>/lmwt.lstm',
def main():
    """Drive the TRF language-model experiment from command-line flags.

    A usage hint is printed when no option is given. The stages below run
    when the corresponding flag (or ``-all`` where noted) is in ``sys.argv``:

    * ``-train`` / ``-all``: build the option string, call ``model.prepare``
      on ``data/train`` and ``data/valid`` and then ``model.train``.
    * ``-plot``: plot the training log against the ``KN5`` entry of
      ``models_ppl.txt``.
    * ``-rescore`` / ``-all``: for each n-best type and task, preprocess the
      n-best list with ``process_nbest`` and score it via ``model.use``.
    * ``-wer`` / ``-all``: compute, tune and print WERs through the ``wer``
      module for the LM types ``TRF``, ``RNN``, ``KN5`` and ``RNN+TRF``.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    run_all = '-all' in sys.argv

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    nbest_root = 'data/nbest/'
    nbest_type_list = ['nbest_mvdr_single_heq_multi']

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)

    order = 4
    # Alternative feature file without the "_tied" suffix:
    # g4_w_c_ws_cs_wsh_csh.fs
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    maxlen = 0
    tmax = 20000
    t0 = 0
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 1e-6
    thread = 8

    # feat[0:-3] drops the ".fs" extension.
    write_model = workdir + 'trf_c{}_{}_2'.format(class_num, feat[0:-3])
    model_name = os.path.split(write_model)[-1]

    tasks = []
    for dataset in ['dt05', 'et05']:
        for condition in ['real', 'simu']:
            tasks.append('nbestlist_{}_{}'.format(dataset, condition))

    if '-train' in sys.argv or run_all:
        train_options = [
            '-vocab {} -train {} -valid {} -test {} '.format(
                vocab, train, valid, test),
            # write_model[0:-2] drops the trailing "_2" from the model name.
            ' -read {}.model'.format(write_model[0:-2]),
            ' -order {} -feat {} '.format(order, feat),
            ' -len {} '.format(maxlen),
            ' -write {0}.model -log {0}.log '.format(write_model),
            ' -t0 {} -iter {}'.format(t0, tmax),
            ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta),
            ' -L2 {} '.format(reg),
            ' -mini-batch {} '.format(minibatch),
            ' -thread {} '.format(thread),
            ' -print-per-iter 10 ',
            # output the intermediate models
            ' -write-at-iter [{}:10000:{}]'.format(tmax - 30000, tmax),
        ]
        config = ''.join(train_options)

        model.prepare('data/train', 'data/valid', 'data/valid', class_num)
        model.train(config)

    if '-plot' in sys.argv:
        baseline = fres.Get('KN5')
        trf.PlotLog([write_model], [baseline])

    if '-rescore' in sys.argv or run_all:
        for nbest_type in nbest_type_list:
            nbest_dir = nbest_root + nbest_type + '/'
            for tsk in tasks:
                write_dir = workdir + nbest_type + '/' + tsk + '/'
                wb.mkdir(write_dir)
                print('{} : {}'.format(nbest_type, tsk))
                print(' write -> {}'.format(write_dir))

                write_lmscore = write_dir + model_name

                # fill the empty lines
                process_nbest(nbest_dir + tsk + '/words_text',
                              write_lmscore + '.nbest')

                rescore_options = [
                    ' -vocab {} '.format(vocab),
                    ' -read {}.model '.format(write_model),
                    ' -nbest {} '.format(write_lmscore + '.nbest'),
                    ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                        write_lmscore),
                ]
                config = ''.join(rescore_options)
                model.use(config)

    if '-wer' in sys.argv or run_all:
        for nbest_type in nbest_type_list:
            nbest_dir = nbest_root + nbest_type + '/'

            # "<tsk>" is a placeholder that is left in the paths as-is here.
            lmpaths = {
                'KN5': nbest_dir + '<tsk>/lmwt.lmonly',
                'RNN': nbest_dir + '<tsk>/lmwt.rnn',
                'LSTM': 'lstm/' + nbest_type + '/<tsk>/lmwt.lstm',
                'TRF': workdir + nbest_type + '/<tsk>/' + model_name + '.lmscore',
            }
            lmtypes = ['TRF', 'RNN', 'KN5', 'RNN+TRF']

            wer_workdir = 'wer/' + nbest_type + '/'
            print('wer_workdir = ' + wer_workdir)

            wer.wer_all(wer_workdir, nbest_dir, lmpaths, lmtypes)
            config = wer.wer_tune(wer_workdir)
            wer.wer_print(wer_workdir, config)