Example #1
# Assumed module-level imports for these examples; wb and trf (and, in
# Example #2, wer) are helper modules shipped with the TRF toolkit, and the
# exact import form below is an assumption:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import wb
import trf

def main():
    if len(sys.argv) == 1:
        print('"python run.py -train"   train the TRF LM\n',
              '"python run.py -rescore" rescore the nbest lists\n',
              '"python run.py -wer"     compute the WER')

    run_times = range(0, 10)   # repeat the experiment over 10 runs

    
    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    #feat = 'g4_w_c_ws_cs_cpw.fs'
    maxlen = 0
    tmax = 20000
    t0 = 2000
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 4e-5
    thread = 8

    if '-res' in sys.argv:
        fres.Read()
        for i in range(1,len(fres.head)):
            value = []
            for runnum in run_times:
                write_name = 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)
                line = fres.Get(write_name)
                value.append(line[i])
            fres.Add('trf_c{}_{}.runavg'.format(class_num, feat[0:-3]), [fres.head[i]],
                     ['{:.2f}+{:.2f}'.format(np.mean(value), np.std(value))] )

    for runnum in run_times:
        write_model = workdir + 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)

        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 '
            config += ' -write-at-iter [{}:1000:{}]'.format(tmax-5000, tmax)  # output the intermediate models
            model.prepare(data()[0], data()[1], data()[2], class_num)
            model.train(config)
        if '-plot' in sys.argv:
            baseline = fres.Get('KN5')
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data()[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            [read_nbest, read_templ, read_acscore, read_lmscore] = data()[3:7]
            read_lmscore = write_model + '.lmscore'

            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 wb.LoadScore(read_lmscore),
                                                 wb.LoadScore(read_acscore), np.linspace(0.1,0.9,9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(wer, lmscale, acscale))

            # calculate the ppl on wsj test
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            LL = model.get_last_value(write_model + '.log')

            # output the result
            name = os.path.split(write_model)[1]
            fres.AddLL(name, LL, data()[0:3])
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(name, wer)
        if '-stat' in sys.argv:
            # calculate the mean and std of the WERs of the intermediate models
            inte_wer = []
            inte_model = []

            # find the intermediate model files
            for file_name in os.listdir(os.path.split(write_model)[0]):
                file_path = os.path.split(write_model)[0] + os.sep + file_name
                if not os.path.isfile(file_path):
                    continue
                if file_name.find(os.path.split(write_model)[1]) == 0 and \
                    file_path.split('.')[-1] == 'model' and \
                    file_path.split('.')[-2][0] == 'n':
                    inte_model.append(file_path)

            # compute wer
            flog = open(workdir + 'inte_model_wer.log', 'wt')
            for file_path in sorted(inte_model):
                print(file_path)
                t = int(file_path.split('.')[-2][1:])

                # lmscore
                write_lmscore = os.path.splitext(file_path)[0] + '.lmscore'
                config = ' -vocab {} '.format(vocab)
                config += ' -read {} '.format(file_path)
                config += ' -nbest {} '.format(data()[3])
                config += ' -lmscore {0} '.format(write_lmscore)
                model.use(config, False)
                # wer
                [wer, lmscale, acscale] = wb.TuneWER(data()[3], data()[4],
                                                 wb.LoadScore(write_lmscore),
                                                 wb.LoadScore(data()[5]), np.linspace(0.1, 0.9, 9))
                print('t={} wer={}'.format(t, wer))
                flog.write('{} \t wer={}\n'.format(file_path, wer))
                inte_wer.append([t, wer])
            flog.close()

            # plot wer
            inte_wer = sorted(inte_wer, key=lambda d: d[0])
            t_list = [i[0] for i in inte_wer]
            wer_list = [i[1] for i in inte_wer]
            wer_mean = np.mean(wer_list[-20:])
            wer_std = np.std(wer_list[-20:])
            print('wer_mean={}  wer_std={}'.format(wer_mean, wer_std))

            plt.figure()
            plt.plot(t_list, wer_list)
            plt.xlabel('t')
            plt.ylabel('wer')
            plt.show()
        if '-ais' in sys.argv:
            [read_nbest, read_templ, read_acscore] = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of the models
            ais_chain = 10
            ais_inter = 10000
            ais_model = '{}.ais{}_{}.model'.format(write_model, ais_chain, ais_inter)
            if not os.path.exists(ais_model):
                config = ' -vocab {0} -read {1}.model -write {2}'.format(vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(trf.FileMaxLen(read_nbest)-1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = os.path.splitext(ais_model)[0] + '.lmscore'
            config = ' -vocab {} -read {}'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore, np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0]*3
            id_data = [train, valid, test]  # these are word-id files
            for i in range(3):
                config = ' -vocab {} -read {} -test {} '.format(vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(write_model)[1]+":AIS{}-{}".format(ais_chain, ais_inter)
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
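
Note: the example above relies on a module-level helper data() that is not shown in the listing; data()[0:3] supplies the train/valid/test text corpora passed to model.prepare, and data()[3:7] supplies the n-best list, the transcription templates, the acoustic scores, and the baseline LM scores. A minimal hypothetical sketch consistent with that indexing (all paths below are placeholders, not the original ones):

def data():
    # Hypothetical stand-in for the elided data() helper; the indices must
    # line up with the usage above:
    #   [0] train text   [1] valid text   [2] test text
    #   [3] n-best list  [4] transcription templates
    #   [5] acoustic scores  [6] baseline LM scores
    root = 'data/'  # placeholder directory
    return [root + 'train.txt', root + 'valid.txt', root + 'test.txt',
            root + 'nbest.words', root + 'transcript.words',
            root + 'nbest.acscore', root + 'nbest.lmscore']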
Example #2
def main():
    if len(sys.argv) == 1:
        print('"python run.py -train"   train the TRF LM\n',
              '"python run.py -rescore" rescore the nbest lists\n',
              '"python run.py -wer"     compute the WER')

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    nbest_root = 'data/nbest/'
    nbest_type_list = ['nbest_mvdr_single_heq_multi']

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    #feat = 'g4_w_c_ws_cs_wsh_csh.fs'
    maxlen = 0
    tmax = 20000
    t0 = 0
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 1e-6
    thread = 8

    write_model = workdir + 'trf_c{}_{}_2'.format(class_num, feat[0:-3])
    if '-train' in sys.argv or '-all' in sys.argv:
        config = '-vocab {} -train {} -valid {} -test {} '.format(
            vocab, train, valid, test)
        config += ' -read {}.model'.format(write_model[0:-2])
        config += ' -order {} -feat {} '.format(order, feat)
        config += ' -len {} '.format(maxlen)
        config += ' -write {0}.model -log {0}.log '.format(write_model)
        config += ' -t0 {} -iter {}'.format(t0, tmax)
        config += ' -gamma-lambda {} -gamma-zeta {}'.format(
            gamma_lambda, gamma_zeta)
        config += ' -L2 {} '.format(reg)
        config += ' -mini-batch {} '.format(minibatch)
        config += ' -thread {} '.format(thread)
        config += ' -print-per-iter 10 '
        config += ' -write-at-iter [{}:10000:{}]'.format(
            tmax - 30000, tmax)  # output the intermediate models
        model.prepare('data/train', 'data/valid', 'data/valid', class_num)
        model.train(config)
    if '-plot' in sys.argv:
        baseline = fres.Get('KN5')
        trf.PlotLog([write_model], [baseline])
    if '-rescore' in sys.argv or '-all' in sys.argv:
        for nbest_type in nbest_type_list:
            nbest_dir = nbest_root + nbest_type + '/'
            for tsk in [
                    'nbestlist_{}_{}'.format(a, b) for a in ['dt05', 'et05']
                    for b in ['real', 'simu']
            ]:
                write_dir = workdir + nbest_type + '/' + tsk + '/'
                wb.mkdir(write_dir)
                print('{} : {}'.format(nbest_type, tsk))
                print('  write -> {}'.format(write_dir))
                write_lmscore = write_dir + os.path.split(write_model)[-1]
                # fill the empty lines in the n-best word file
                process_nbest(nbest_dir + tsk + '/words_text',
                              write_lmscore + '.nbest')

                config = ' -vocab {} '.format(vocab)
                config += ' -read {}.model '.format(write_model)
                config += ' -nbest {} '.format(write_lmscore + '.nbest')
                config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                    write_lmscore)
                model.use(config)
    if '-wer' in sys.argv or '-all' in sys.argv:
        for nbest_type in nbest_type_list:
            nbest_dir = nbest_root + nbest_type + '/'
            lmpaths = {
                'KN5': nbest_dir + '<tsk>/lmwt.lmonly',
                'RNN': nbest_dir + '<tsk>/lmwt.rnn',
                'LSTM': 'lstm/' + nbest_type + '/<tsk>/lmwt.lstm',
                'TRF': workdir + nbest_type + '/<tsk>/' + os.path.split(write_model)[-1] + '.lmscore'
            }
            # 'TRF': nbestdir + '<tsk>/lmwt.trf'}
            # lmtypes = ['LSTM', 'KN5', 'RNN', 'TRF', 'RNN+KN5', 'LSTM+KN5', 'RNN+TRF', 'LSTM+TRF']
            lmtypes = ['TRF', 'RNN', 'KN5', 'RNN+TRF']
            wer_workdir = 'wer/' + nbest_type + '/'
            print('wer_workdir = ' + wer_workdir)
            wer.wer_all(wer_workdir, nbest_dir, lmpaths, lmtypes)
            config = wer.wer_tune(wer_workdir)
            wer.wer_print(wer_workdir, config)
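
Note: Example #2 also calls a helper process_nbest that is not part of the listing; judging from the comment at its call site, it copies the n-best word file while filling the empty hypothesis lines so that no sentence is dropped during rescoring. A minimal sketch under that assumption (the placeholder token is likewise an assumption):

def process_nbest(read_path, write_path):
    # Hypothetical helper: copy an n-best word file ("utt-id w1 w2 ..." per
    # line), replacing empty hypotheses with a placeholder token so that
    # every line keeps at least one word.
    with open(read_path, 'rt') as fin, open(write_path, 'wt') as fout:
        for line in fin:
            parts = line.split()
            if len(parts) <= 1:          # only the utterance label is present
                parts.append('<unk>')    # placeholder token (assumption)
            fout.write(' '.join(parts) + '\n')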
Example #3
def main():
    if len(sys.argv) == 1:
        print('"python run.py -train"   train the TRF LM\n',
              '"python run.py -rescore" rescore the nbest lists\n',
              '"python run.py -wer"     compute the WER')

    for tsize in [1]:
        bindir = '../../tools/trf/bin/'
        tskdir = '{}/'.format(tsize)
        workdir = tskdir + 'trflm/'

        fres = wb.FRes('result.txt')
        model = trf.model(bindir, workdir)

        class_num = 200
        train = workdir + 'train.id'
        valid = workdir + 'valid.id'
        test = workdir + 'test.id'
        vocab = workdir + 'vocab_c{}.list'.format(class_num)
        order = 4
        feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
        #feat = 'g4_w_c_ws_cs_cpw.fs'
        maxlen = 100
        tmax = 50000
        t0 = 2000
        minibatch = 100
        gamma_lambda = '1000,0'
        gamma_zeta = '0,0.6'
        reg = 1e-6
        thread = 8

        write_model = workdir + 'trf_c{}_{}'.format(class_num, feat[0:-3])
        write_name = '{}:{}'.format(tsize, os.path.split(write_model)[1])

        if '-class' in sys.argv:
            # just run word clustering for each task
            model.prepare(
                data(tskdir)[0],
                data(tskdir)[1],
                data(tskdir)[2], class_num)
        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(
                vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(
                gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 -write-at-iter 10000:10000:{}'.format(
                tmax)
            model.prepare(
                data(tskdir)[0],
                data(tskdir)[1],
                data(tskdir)[2], class_num)
            model.train(config)
            # output
            LL = model.get_last_value(write_model + '.log')
            fres.AddLL(write_name, LL, data(tskdir)[0:3])
        if '-plot' in sys.argv:
            baseline = fres.Get('{}:KN5'.format(tsize))
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data(tskdir)[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            [read_nbest, read_templ, read_acscore,
             read_lmscore] = data(tskdir)[3:7]
            read_lmscore = write_model + '.lmscore'

            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 wb.LoadScore(read_lmscore),
                                                 wb.LoadScore(read_acscore),
                                                 np.linspace(0.1, 0.9, 9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(
                wer, lmscale, acscale))

            # calculate the ppl on wsj test
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)

            # output the result
            fres.Add(write_name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(write_name, wer)
Example #4
def PlotLog(name_pack, baseline=[]):
    trf.PlotLog(name_pack, baseline)
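
Note: Example #4 is only a thin wrapper around trf.PlotLog. A usage sketch consistent with how the other examples call it (the model path and result file are illustrative, not taken from the source):

fres = wb.FRes('models_ppl.txt')
baseline = fres.Get('KN5')          # baseline curve, as in Example #1
PlotLog(['trflm/trf_c200_g4_w_c_ws_cs_wsh_csh_tied'], [baseline])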