예제 #1
0
    def ppl(self, vocab, model, txt, isnbest=False):
        """Compute the perplexity of a text or n-best file under a model.

        The input is first converted to an id file (``<workdir><basename>.pplid``)
        using the vocabulary ``<workdir>vocab.list``; the external ``hrf``
        binary is then run on that id file and the number following
        ``-LL = `` in its output is parsed.

        Args:
            vocab: path handed to ``hrf`` as ``-vocab``.
            model: path handed to ``hrf`` as ``-read``.
            txt: path of the text (or n-best) file to evaluate.
            isnbest: when true the input is converted with
                ``trf.NbestToID``; otherwise with ``trf.CorpusToID``.

        Returns:
            The result of ``wb.LL2PPL`` for the parsed value, or ``None`` when
            the tool output contains no ``-LL = `` field (in which case an
            error marker and the raw output are printed).
        """
        list_vocab = self.workdir + 'vocab.list'
        write_id = self.workdir + os.path.split(txt)[-1] + '.pplid'

        # Both input kinds share the same call shape; only the converter differs.
        to_id = trf.NbestToID if isnbest else trf.CorpusToID
        to_id(txt, write_id, trf.ReadVocab(list_vocab))

        cmd = self.bindir + 'hrf '
        cmd += ' -vocab {} -read {} -test {}'.format(vocab, model, write_id)
        # Close the pipe deterministically rather than relying on garbage
        # collection of the temporary object.
        with os.popen(cmd) as pipe:
            s = pipe.read()

        idx = s.find('-LL = ')
        if idx == -1:
            print('[ERROR]!!')
            print(s)
            return None

        # Tokens from idx on are '-LL', '=', '<value>', so index 2 is the number.
        LL = float(s[idx:].split()[2])
        return wb.LL2PPL(-LL, write_id)
예제 #2
0
def main():
    """Train, rescore and evaluate class-based TRF language models over several runs.

    The action is selected by flags found anywhere in ``sys.argv``:

    ``-res``      read the result file and add a ``runavg`` row holding
                  ``mean+std`` of every column over the runs in ``run_times``.
    ``-train``    prepare the data and train one model per run.
    ``-plot``     plot the training log against the ``KN5`` result row.
    ``-rescore``  write LM scores for the n-best list.
    ``-wer``      tune the WER, compute LL/PPL on the transcripts and record
                  the values through ``fres``.
    ``-stat``     rescore with every intermediate model of a run, then log
                  and plot the WER against the iteration number.
    ``-ais``      re-estimate the normalization constants with AIS and
                  re-evaluate the resulting model.
    ``-all``      shorthand for ``-train``, ``-rescore`` and ``-wer``.

    With no arguments a usage hint is printed; execution then continues, but
    since no flag matches, no per-run action is taken.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER'
              )

    run_times = range(0, 10)   # for multiple run

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    # Model / training hyper-parameters forwarded to the external tool.
    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    order = 4
    feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
    #feat = 'g4_w_c_ws_cs_cpw.fs'
    maxlen = 0
    tmax = 20000
    t0 = 2000
    minibatch = 100
    gamma_lambda = '3000,0'
    gamma_zeta = '0,0.6'
    reg = 4e-5
    thread = 8

    if '-res' in sys.argv:
        # Summarise every result column (index 0 is skipped) over all runs.
        fres.Read()
        for i in range(1, len(fres.head)):
            value = []
            for runnum in run_times:
                # feat[0:-3] strips the trailing '.fs' extension.
                write_name = 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)
                line = fres.Get(write_name)
                value.append(line[i])
            fres.Add('trf_c{}_{}.runavg'.format(class_num, feat[0:-3]), [fres.head[i]],
                     ['{:.2f}+{:.2f}'.format(np.mean(value), np.std(value))])

    for runnum in run_times:
        write_model = workdir + 'trf_c{}_{}.run{}'.format(class_num, feat[0:-3], runnum)

        if '-train' in sys.argv or '-all' in sys.argv:
            config = '-vocab {} -train {} -valid {} -test {} '.format(vocab, train, valid, test)
            config += ' -order {} -feat {} '.format(order, feat)
            config += ' -len {} '.format(maxlen)
            config += ' -write {0}.model -log {0}.log '.format(write_model)
            config += ' -t0 {} -iter {}'.format(t0, tmax)
            config += ' -gamma-lambda {} -gamma-zeta {}'.format(gamma_lambda, gamma_zeta)
            config += ' -L2 {} '.format(reg)
            config += ' -mini-batch {} '.format(minibatch)
            config += ' -thread {} '.format(thread)
            config += ' -print-per-iter 10 '
            config += ' -write-at-iter [{}:1000:{}]'.format(tmax-5000, tmax)  # output the intermediate models
            model.prepare(data()[0], data()[1], data()[2], class_num)
            model.train(config)
        if '-plot' in sys.argv:
            baseline = fres.Get('KN5')
            trf.PlotLog([write_model], [baseline])
        if '-rescore' in sys.argv or '-all' in sys.argv:
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -nbest {} '.format(data()[3])
            config += ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(write_model)
            model.use(config)
        if '-wer' in sys.argv or '-all' in sys.argv:
            [read_nbest, read_templ, read_acscore, read_lmscore] = data()[3:7]
            # The LM score written by the -rescore step replaces the one from data().
            read_lmscore = write_model + '.lmscore'

            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 wb.LoadScore(read_lmscore),
                                                 wb.LoadScore(read_acscore), np.linspace(0.1,0.9,9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(wer, lmscale, acscale))

            # calculate the ppl on wsj test
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)
            config = ' -vocab {} '.format(vocab)
            config += ' -read {}.model '.format(write_model)
            config += ' -test {} '.format(write_templ_id)
            LL_templ = model.use(config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            LL = model.get_last_value(write_model + '.log')

            # output the result
            name = os.path.split(write_model)[1]
            fres.AddLL(name, LL, data()[0:3])
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(name, wer)
        if '-stat' in sys.argv:
            # calculate the mean and std of the WERs of the intermediate models
            inte_wer = []
            inte_model = []

            # find the intermediate models: regular files in the model
            # directory named '<run name>...n<iter>.model'
            model_dir, model_base = os.path.split(write_model)
            for file_name in os.listdir(model_dir):
                file_path = model_dir + os.sep + file_name
                if not os.path.isfile(file_path):
                    continue
                parts = file_path.split('.')
                # startswith() also copes with an empty name segment, where
                # indexing its first character would raise IndexError.
                if file_name.startswith(model_base) and \
                        parts[-1] == 'model' and \
                        parts[-2].startswith('n'):
                    inte_model.append(file_path)

            # compute wer; the context manager closes the log even when
            # rescoring or WER tuning raises.
            with open(workdir + 'inte_model_wer.log', 'wt') as flog:
                for file_path in sorted(inte_model):
                    print(file_path)
                    # the segment before '.model' is 'n<iter>'
                    t = int(file_path.split('.')[-2][1:])

                    # lmscore
                    write_lmscore = os.path.splitext(file_path)[0] + '.lmscore'
                    config = ' -vocab {} '.format(vocab)
                    config += ' -read {} '.format(file_path)
                    config += ' -nbest {} '.format(data()[3])
                    config += ' -lmscore {0} '.format(write_lmscore)
                    model.use(config, False)
                    # wer
                    [wer, lmscale, acscale] = wb.TuneWER(data()[3], data()[4],
                                                         wb.LoadScore(write_lmscore),
                                                         wb.LoadScore(data()[5]), np.linspace(0.1, 0.9, 9))
                    print('t={} wer={}'.format(t, wer))
                    flog.write('{} \t wer={}\n'.format(file_path, wer))
                    inte_wer.append([t, wer])

            # plot wer
            inte_wer = sorted(inte_wer, key=lambda d: d[0])
            t_list = [i[0] for i in inte_wer]
            wer_list = [i[1] for i in inte_wer]
            # statistics over (at most) the last 20 intermediate models
            wer_mean = np.mean(wer_list[-20:])
            wer_std = np.std(wer_list[-20:])
            print('wer_mean={}  wer_std={}'.format(wer_mean, wer_std))

            plt.figure()
            plt.plot(t_list, wer_list)
            plt.xlabel('t')
            plt.ylabel('wer')
            plt.show()
        if '-ais' in sys.argv:
            [read_nbest, read_templ, read_acscore] = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of the model
            ais_chain = 10
            ais_inter = 10000
            ais_model = '{}.ais{}_{}.model'.format(write_model, ais_chain, ais_inter)
            # An existing AIS model is reused instead of being recomputed.
            if not os.path.exists(ais_model):
                config = ' -vocab {0} -read {1}.model -write {2}'.format(vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(trf.FileMaxLen(read_nbest)-1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = os.path.splitext(ais_model)[0] + '.lmscore'
            config = ' -vocab {} -read {}'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            # NOTE(review): the -wer branch passes wb.LoadScore(...) results to
            # TuneWER, whereas this call passes the score file paths -- confirm
            # that wb.TuneWER accepts both forms.
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore, np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0]*3
            id_data = [train, valid, test]  # are id files
            for i in range(3):
                config = ' -vocab {} -read {} -test {} '.format(vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(write_model)[1]+":AIS{}-{}".format(ais_chain, ais_inter)
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
예제 #3
0
def main():
    """Evaluate AIS-normalized TRF models and compare logZ estimates.

    The action is selected by flags found anywhere in ``sys.argv``:

    ``-wer``   add a ``runavg`` row with ``mean+std`` of every result column
               over the AIS models of the runs in ``run_times``.
    ``-ais``   for every run, run AIS on the trained model (unless the AIS
               model file already exists), rescore the n-best list, and record
               LL / WER / PPL through ``fres``.
    ``-cmp``   compare the variance of logZ over 10 AIS runs with the
               variance of the AIS weights read from a log file.
    ``-cmp2``  plot the logZ of the trained model, of 10 AIS runs, of their
               mean and of the AIS model selected by ``ais_chain``/``ais_inter``.
    ``-cmp3``  write a model whose logZ is the mean over 10 AIS runs, then
               print its WER and PPL and plot the logZ curves.
    ``-wer3``  write a linearly smoothed logZ for 10 AIS models and record
               WER / LL / PPL of the original and smoothed models.
    ``-wer2``  write a model whose logZ is the mean of the AIS weights for 10
               AIS models and record WER / LL / PPL.

    With no arguments a usage hint is printed; execution then continues, but
    no flag matches.  ``mat_var``, ``revise_logz``, ``load_ais_weight`` and
    ``data`` are defined outside this function.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    run_times = range(0, 1)  # for multiple run

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    thread = 18

    # AIS settings used by -wer, -ais and -cmp2; the other branches hard-code
    # the 'ais10_20000' file names instead.
    ais_chain = 10
    ais_inter = 200000

    if '-wer' in sys.argv:
        # calculate the mean and std of every result column over the AIS
        # models of the runs in run_times
        res_list = []
        for runnum in run_times:
            name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais{}_{}'.format(
                runnum, ais_chain, ais_inter)
            # drop the first field of the row
            res = fres.Get(name)[1:]
            # on the first run, create one empty list per column
            if run_times.index(runnum) == 0:
                res_list = [[] for i in range(len(res))]
            for i in range(len(res)):
                res_list[i].append(res[i])
        name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais{}_{}'.format(
            ais_chain, ais_inter)
        head = fres.GetHead()[1:]
        for i in range(len(head)):
            mean = np.mean(res_list[i])
            std = np.std(res_list[i])
            fres.Add(name, [head[i]], ['{:.2f}+{:.2f}'.format(mean, std)])

    if '-ais' in sys.argv:
        for runnum in run_times:
            write_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}'.format(
                runnum)

            [read_nbest, read_templ, read_acscore] = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            v = trf.ReadVocab(vocab)
            trf.NbestToID(read_templ, write_templ_id, v)

            # run AIS to calculate the normalization constants of the model;
            # skipped when the AIS model file already exists
            ais_model = '{}.ais{}_{}'.format(write_model, ais_chain, ais_inter)
            if not os.path.exists(ais_model + '.model'):
                config = ' -vocab {0} -read {1}.model -write {2}.model -log {2}.log'.format(
                    vocab, write_model, ais_model)
                config += ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(
                    ais_chain, ais_inter, thread)
                config += ' -norm-len-max {} '.format(
                    trf.FileMaxLen(read_nbest) -
                    1)  # just compute the needed length
                model.use(config)

            # rescore and compute wer
            write_lmscore = ais_model + '.lmscore'
            config = ' -vocab {} -read {}.model'.format(vocab, ais_model)
            config += ' -nbest {} -test {} '.format(
                read_nbest, write_templ_id)  # calculate the ppl of test set
            config += ' -lmscore {} '.format(write_lmscore)
            LL_templ = model.use(config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            # NOTE(review): the score arguments are file paths here -- confirm
            # that wb.TuneWER accepts paths as well as loaded scores.
            [wer, lmscale, acscale] = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore,
                                                 np.linspace(0.1, 0.9, 9))
            # calculate the LL of train/valid/test
            LL = [0] * 3
            id_data = [train, valid, test]  # are id files
            for i in range(3):
                config = ' -vocab {} -read {}.model -test {} '.format(
                    vocab, ais_model, id_data[i])
                LL[i] = model.use(config, False)

            # write to res file
            name = os.path.split(ais_model)[-1]
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])

    if '-cmp' in sys.argv:
        # compare the variance of exp(logz) with the variance of AIS weight
        # Load the logz of 10 independent runs
        multi_run = 10
        logzs = []
        for i in range(multi_run):
            logz = trf.LoadLogz(
                workdir +
                'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.run{}.model'
                .format(i))
            # keep only the first 33 entries (presumably lengths 1..33 -- TODO confirm)
            logzs.append(logz[0:33])
        # transpose so that each row collects the values of all runs for one entry
        mat_logzs = np.matrix(logzs).T

        # Load the weight of each length
        logws = []
        with open(workdir +
                  'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.log'
                  ) as f:
            for line in f:
                idx = line.find('logw=')
                if idx != -1:
                    # the first token from 'logw=' on is skipped; the
                    # remaining tokens are parsed as floats
                    a = line[idx:].split()[1:]
                    logws.append([float(i) for i in a])
        mat_logws = np.matrix(logws)

        # mat_var is defined outside this function
        w_var = mat_var(mat_logws)
        z_var = mat_var(mat_logzs)

        for i in range(len(w_var)):
            rate = np.exp(w_var[i] - z_var[i])
            print('len={} w_var={} z_var={} rate={}'.format(
                i + 1, w_var[i], z_var[i], rate))
    if '-cmp2' in sys.argv:
        # compare the logz of AIS and the SAMS
        write_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'
        logz_sams = trf.LoadLogz(write_model + '.model')
        logz_ais = trf.LoadLogz('{}.ais{}_{}.model'.format(
            write_model, ais_chain, ais_inter))
        plt.figure()
        plt.plot(logz_sams[0:33], 'r-', label='sams')
        # one green dashed curve per 'ais10_20000' run
        logz_ais10 = []
        for n in range(10):
            logz_ais10.append(
                trf.LoadLogz('{}.ais10_20000.run{}.model'.format(
                    write_model, n)))
            plt.plot(logz_ais10[-1][0:33], 'g--')
        # element-wise mean of the first 33 logz values over the 10 runs
        logz_ais_m = [0] * 33
        for i in range(33):
            for n in range(10):
                logz_ais_m[i] += logz_ais10[n][i]
            logz_ais_m[i] /= 10
        plt.plot(logz_ais_m[0:33], 'r--')
        plt.plot(logz_ais[0:33], 'b--', label='ais 10-200K')
        #plt.legend()
        plt.show()

    if '-cmp3' in sys.argv:
        trf_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'
        # revise the logz of the trf model to the mean of results of 10 (10-20k) runs
        logz_sams = trf.LoadLogz(trf_model + '.model')
        logz_ais10 = []
        for n in range(10):
            logz_ais10.append(
                trf.LoadLogz('{}.ais10_20000.run{}.model'.format(trf_model,
                                                                 n)))
        # element-wise mean of the first 33 logz values over the 10 runs
        logz_ais_m = [0] * 33
        for i in range(33):
            for n in range(10):
                logz_ais_m[i] += logz_ais10[n][i]
            logz_ais_m[i] /= 10

        print(logz_ais_m)
        ais_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais10_20000.runavg.model'
        print('write -> ' + ais_model)
        # revise_logz is defined outside this function
        revise_logz(trf_model + '.model', ais_model, logz_ais_m)

        # compute WER
        print('computer WER')
        wer = model.wer(vocab, ais_model, data()[3], data()[4], data()[5])
        print('WER={}'.format(wer))

        # compute PPL
        # NOTE(review): the -wer3 / -wer2 branches unpack model.ppl() as
        # [ppl, LL]; here the whole return value is printed -- confirm intent.
        print('computer PPL')
        ppl = model.ppl(vocab, ais_model, data()[4], True)
        print('PPL={}'.format(ppl))

        # plot the logzs
        plt.figure()
        for n in range(10):
            plt.plot(logz_ais10[n][0:33], 'g-')
        plt.plot(logz_ais_m[0:33], 'r', label='ais10-20K-mean')
        plt.plot(logz_sams[0:33], 'b', label='sams')
        plt.legend()
        plt.show()

    if '-wer3' in sys.argv:
        # smooth zeta
        wer_ais = []
        wer_smooth = []
        ppl_ais = []
        ppl_smooth = []
        ll_ais = []
        ll_smooth = []
        for n in range(10):
            ais_name = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais10_20000'.format(
                n)
            print(ais_name)

            # fit a straight line (degree-1 polynomial) to the first 33 logz
            # values over x = 1..33 and use the fitted line as the smoothed logz
            logz_ais = trf.LoadLogz(ais_name + '.model')[0:33]
            z = np.polyfit(np.linspace(1, 33, 33), logz_ais, 1)
            logz_ais_smooth = z[0] * np.linspace(1, 33, 33) + z[1]
            revise_logz(ais_name + '.model', ais_name + '.smooth.model',
                        logz_ais_smooth.tolist())
            print(logz_ais)
            print(logz_ais_smooth.tolist())

            # only for the first model: plot the AIS weights together with
            # the standard, smoothed and SAMS logZ curves
            if n == 0:
                logz_sams = trf.LoadLogz(
                    workdir +
                    'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.model'.format(n))
                # load_ais_weight is defined outside this function
                logw = load_ais_weight(ais_name + '.log')
                plt.figure()
                for i in range(len(logw)):
                    # all weights of entry i are drawn as dots at x = i + 1
                    plt.plot((i + 1) * np.ones(len(logw[i])), logw[i], 'k.')
                plt.plot(np.linspace(1, 33, 33),
                         logz_ais,
                         'r-',
                         label='standard AIS')
                plt.plot(np.linspace(1, 33, 33),
                         logz_ais_smooth,
                         'g-',
                         label='smoothed AIS')
                plt.plot(np.linspace(1, 33, 33),
                         logz_sams[0:33],
                         'b-',
                         label='SAMS')
                plt.legend()
                plt.xlim(1, 33)
                plt.xlabel('length')
                plt.ylabel('logZ')
                plt.show()

            # evaluate the original AIS model
            wer = model.wer(vocab, ais_name + '.model',
                            data()[3],
                            data()[4],
                            data()[5])
            [ppl, LL] = model.ppl(vocab, ais_name + '.model', data()[4], True)
            wer_ais.append(wer)
            ppl_ais.append(ppl)
            ll_ais.append(LL)

            fres.Add(
                os.path.split(ais_name)[-1], ['WER', 'LL-wsj', 'PPL-wsj'],
                [wer, LL, ppl])

            # evaluate the smoothed model
            wer = model.wer(vocab, ais_name + '.smooth.model',
                            data()[3],
                            data()[4],
                            data()[5])
            [ppl, LL] = model.ppl(vocab, ais_name + '.smooth.model',
                                  data()[4], True)
            wer_smooth.append(wer)
            ppl_smooth.append(ppl)
            ll_smooth.append(LL)

            fres.Add(
                os.path.split(ais_name)[-1] + '.smooth',
                ['WER', 'LL-wsj', 'PPL-wsj'], [wer, LL, ppl])

        # mean+std over the 10 original AIS models
        for label, d in zip(['WER', 'LL-wsj', 'PPL-wsj'],
                            [wer_ais, ll_ais, ppl_ais]):
            cur_mean = np.mean(d)
            cur_std = np.std(d)
            fres.Add('trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000',
                     [label], ['{:.2f}+{:.2f}'.format(cur_mean, cur_std)])

        # mean+std over the 10 smoothed models
        for label, d in zip(['WER', 'LL-wsj', 'PPL-wsj'],
                            [wer_smooth, ll_smooth, ppl_smooth]):
            cur_mean = np.mean(d)
            cur_std = np.std(d)
            fres.Add(
                'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000.smooth',
                [label], ['{:.2f}+{:.2f}'.format(cur_mean, cur_std)])

    if '-wer2' in sys.argv:
        # perform adjust-AIS and evaluate the WER and PPL

        results = []
        for n in range(10):
            ais_name = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run{}.ais10_20000'.format(
                n)
            print(ais_name)
            # the adjusted logz is the plain mean of the loaded AIS weights
            # of each entry
            logw = load_ais_weight(ais_name + '.log')
            logz = [np.mean(a) for a in logw]
            revise_logz(ais_name + '.model', ais_name + '.adjust.model', logz)
            print('  wer')
            wer = model.wer(vocab, ais_name + '.adjust.model',
                            data()[3],
                            data()[4],
                            data()[5])
            print('  ppl')
            [ppl, LL] = model.ppl(vocab, ais_name + '.adjust.model',
                                  data()[4], True)
            fres.Add(
                os.path.split(ais_name)[-1] + '.ad',
                ['WER', 'LL-wsj', 'PPL-wsj'], [wer, LL, ppl])
            results.append([wer, LL, ppl])

        # column-wise mean and std of [wer, LL, ppl] over the 10 models
        res_mean = []
        res_std = []
        for i in range(3):
            a = [b[i] for b in results]
            res_mean.append(np.mean(a))
            res_std.append(np.std(a))
        fres.Add('trf_c200_g4_w_c_ws_cs_wsh_csh_tied.runavg.ais10_20000.ad', [
            'WER', 'LL-wsj', 'PPL-wsj'
        ], ['{:.2f}+{:.2f}'.format(res_mean[i], res_std[i]) for i in range(3)])
예제 #4
0
def main():
    """Repeat AIS on the run-0 TRF model and summarise the repeated runs.

    Flags read from ``sys.argv``:

    ``-ais``  for every index in ``run_times``, create the AIS model of the
              run-0 TRF model (unless its file already exists), rescore the
              n-best list and record LL / WER / PPL through ``fres``.
    ``-wer``  add an ``avg`` row holding ``mean+std`` of every result column
              over the repeated AIS runs.

    With no arguments a usage hint is printed and nothing else matches.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    # indices of the repeated AIS runs
    run_times = range(3, 10)

    bindir = '../../tools/trf/bin/'
    workdir = 'trflm/'
    fres = wb.FRes('models_ppl.txt')
    model = trf.model(bindir, workdir)

    class_num = 200
    train = workdir + 'train.id'
    valid = workdir + 'valid.id'
    test = workdir + 'test.id'
    vocab = workdir + 'vocab_c{}.list'.format(class_num)
    thread = 8

    ais_chain = 10
    ais_inter = 20000

    if '-wer' in sys.argv:
        # columns[i] gathers the i-th result field of every run
        columns = []
        for position, runnum in enumerate(run_times):
            row_name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais{}_{}.run{}'.format(
                ais_chain, ais_inter, runnum)
            row = fres.Get(row_name)[1:]
            if position == 0:
                columns = [[] for _ in range(len(row))]
            for i, cell in enumerate(row):
                columns[i].append(cell)

        avg_name = 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0.ais{}_{}.avg'.format(
            ais_chain, ais_inter)
        titles = fres.GetHead()[1:]
        for i, title in enumerate(titles):
            summary = '{:.2f}+{:.2f}'.format(np.mean(columns[i]),
                                             np.std(columns[i]))
            fres.Add(avg_name, [title], [summary])

    if '-ais' in sys.argv:
        for runnum in run_times:
            base_model = workdir + 'trf_c200_g4_w_c_ws_cs_wsh_csh_tied.run0'

            read_nbest, read_templ, read_acscore = data()[3:6]
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            trf.NbestToID(read_templ, write_templ_id, trf.ReadVocab(vocab))

            # run AIS to estimate the normalization constants, unless the
            # AIS model of this repetition already exists
            ais_model = '{}.ais{}_{}.run{}.model'.format(
                base_model, ais_chain, ais_inter, runnum)
            if not os.path.exists(ais_model):
                # only the lengths that are actually needed
                norm_len_max = trf.FileMaxLen(read_nbest) - 1
                model.use(''.join([
                    ' -vocab {0} -read {1}.model -write {2}'.format(
                        vocab, base_model, ais_model),
                    ' -norm-method AIS -AIS-chain {} -AIS-inter {} -thread {} '.format(
                        ais_chain, ais_inter, thread),
                    ' -norm-len-max {} '.format(norm_len_max),
                ]))

            # rescore the n-best list and evaluate the transcripts
            write_lmscore = os.path.splitext(ais_model)[0] + '.lmscore'
            rescore_config = ''.join([
                ' -vocab {} -read {}'.format(vocab, ais_model),
                ' -nbest {} -test {} '.format(read_nbest, write_templ_id),
                ' -lmscore {} '.format(write_lmscore),
            ])
            LL_templ = model.use(rescore_config, False)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)
            wer, _lmscale, _acscale = wb.TuneWER(read_nbest, read_templ,
                                                 write_lmscore, read_acscore,
                                                 np.linspace(0.1, 0.9, 9))

            # LL of the train / valid / test id files, in that order
            id_data = [train, valid, test]
            LL = [
                model.use(
                    ' -vocab {} -read {} -test {} '.format(
                        vocab, ais_model, id_file), False)
                for id_file in id_data
            ]

            # record the results under the model file name without '.model'
            name = os.path.splitext(os.path.split(ais_model)[-1])[0]
            fres.AddLL(name, LL, id_data)
            fres.AddWER(name, wer)
            fres.Add(name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
예제 #5
0
def main():
    """Train and evaluate a class-based TRF model for each task directory.

    Flags read from ``sys.argv``:

    ``-class``    only run the data preparation (word clustering).
    ``-train``    prepare the data, train the model and record the LL values.
    ``-plot``     plot the training log against the ``<tsize>:KN5`` row.
    ``-rescore``  write LM scores for the n-best list.
    ``-wer``      tune the WER and compute LL/PPL on the transcripts.
    ``-all``      shorthand for ``-train``, ``-rescore`` and ``-wer``.

    With no arguments a usage hint is printed and nothing else matches.
    """
    if len(sys.argv) == 1:
        print('\"python run.py -train\" train LSTM\n',
              '\"python run.py -rescore\" rescore nbest\n',
              '\"python run.py -wer\" compute WER')

    for tsize in [1]:
        bindir = '../../tools/trf/bin/'
        tskdir = '{}/'.format(tsize)
        workdir = tskdir + 'trflm/'

        fres = wb.FRes('result.txt')
        model = trf.model(bindir, workdir)

        # hyper-parameters forwarded to the external tool
        class_num = 200
        train = workdir + 'train.id'
        valid = workdir + 'valid.id'
        test = workdir + 'test.id'
        vocab = workdir + 'vocab_c{}.list'.format(class_num)
        order = 4
        feat = 'g4_w_c_ws_cs_wsh_csh_tied.fs'
        maxlen = 100
        tmax = 50000
        t0 = 2000
        minibatch = 100
        gamma_lambda = '1000,0'
        gamma_zeta = '0,0.6'
        reg = 1e-6
        thread = 8

        # feat[0:-3] strips the trailing '.fs'
        write_model = workdir + 'trf_c{}_{}'.format(class_num, feat[0:-3])
        write_name = '{}:{}'.format(tsize, os.path.split(write_model)[1])

        run_all = '-all' in sys.argv

        def prepare_data():
            # shared by -class and -train
            model.prepare(
                data(tskdir)[0],
                data(tskdir)[1],
                data(tskdir)[2], class_num)

        if '-class' in sys.argv:
            # clustering only, for each task
            prepare_data()

        if '-train' in sys.argv or run_all:
            train_config = ''.join([
                '-vocab {} -train {} -valid {} -test {} '.format(
                    vocab, train, valid, test),
                ' -order {} -feat {} '.format(order, feat),
                ' -len {} '.format(maxlen),
                ' -write {0}.model -log {0}.log '.format(write_model),
                ' -t0 {} -iter {}'.format(t0, tmax),
                ' -gamma-lambda {} -gamma-zeta {}'.format(
                    gamma_lambda, gamma_zeta),
                ' -L2 {} '.format(reg),
                ' -mini-batch {} '.format(minibatch),
                ' -thread {} '.format(thread),
                ' -print-per-iter 10 -write-at-iter 10000:10000:{}'.format(
                    tmax),
            ])
            prepare_data()
            model.train(train_config)
            # record the final values found in the training log
            LL = model.get_last_value(write_model + '.log')
            fres.AddLL(write_name, LL, data(tskdir)[0:3])

        if '-plot' in sys.argv:
            baseline = fres.Get('{}:KN5'.format(tsize))
            trf.PlotLog([write_model], [baseline])

        if '-rescore' in sys.argv or run_all:
            rescore_config = ''.join([
                ' -vocab {} '.format(vocab),
                ' -read {}.model '.format(write_model),
                ' -nbest {} '.format(data(tskdir)[3]),
                ' -lmscore {0}.lmscore -lmscore-test-id {0}.test-id '.format(
                    write_model),
            ])
            model.use(rescore_config)

        if '-wer' in sys.argv or run_all:
            # the LM score from data() is ignored in favour of the one
            # written by the rescoring step
            read_nbest, read_templ, read_acscore, _ = data(tskdir)[3:7]
            read_lmscore = write_model + '.lmscore'

            wer, lmscale, acscale = wb.TuneWER(read_nbest, read_templ,
                                               wb.LoadScore(read_lmscore),
                                               wb.LoadScore(read_acscore),
                                               np.linspace(0.1, 0.9, 9))
            print('wer={:.4f} lmscale={:.2f} acscale={:.2f}'.format(
                wer, lmscale, acscale))

            # perplexity on the transcripts (wsj test)
            write_templ_id = workdir + os.path.split(read_templ)[1] + '.id'
            trf.NbestToID(read_templ, write_templ_id, trf.ReadVocab(vocab))
            test_config = ''.join([
                ' -vocab {} '.format(vocab),
                ' -read {}.model '.format(write_model),
                ' -test {} '.format(write_templ_id),
            ])
            LL_templ = model.use(test_config)
            PPL_templ = wb.LL2PPL(-LL_templ, write_templ_id)

            # record the results
            fres.Add(write_name, ['LL-wsj', 'PPL-wsj'], [LL_templ, PPL_templ])
            fres.AddWER(write_name, wer)