예제 #1
0
def main():
    """Train n-gram models of order 3, 4 and 5 and rescore the n-best list.

    For each order the function builds a model in its own work directory
    (``ngramlm/<config string>``), trains it, rescores the n-best list,
    writes the LM scores to ``nbest.lmscore`` in that directory, then
    prints the WER and records it through ``fres``.

    ``res_file`` and ``fres`` are not defined in this function; they are
    expected to be provided by the enclosing module.
    """
    data = reader.Data().load_raw_data(
        reader.ptb_raw_dir(),
        add_beg_token='<s>',
        add_end_token='</s>',
    )
    nbest = reader.NBest(*reader.wsj0_nbest())

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    # The directory holding the external tools depends on the platform.
    bindir = 'd:\\wangbin\\tools' if wb.is_window() else '../../tools/srilm'

    for order in (3, 4, 5):
        config.order = order
        label = str(config)
        workdir = 'ngramlm/' + label
        m = ngramlm.Model(config, data, bindir, workdir, name=label)

        print('train...')
        m.train(write_to_res=(res_file, label))

        print('rescore...')
        nbest.lmscore = m.rescore(nbest.get_nbest_list(data))
        score_path = os.path.join(workdir, 'nbest.lmscore')
        nbest.write_lmscore(score_path)

        wer = nbest.wer()
        summary = 'wer={} lmscale={}, acscale={}'.format(
            wer, nbest.lmscale, nbest.acscale)
        print(summary)
        fres.AddWER(label, wer)
예제 #2
0
def main():
    """Train n-gram models of order 3 to 6 with the ``-wbdiscount`` option.

    Each model is trained, used to rescore the n-best list, and its WER is
    printed and recorded through ``fres``.

    ``train``, ``valid``, ``test``, ``nbest_files``, ``res_file`` and
    ``fres`` are not defined in this function; they are expected to be
    provided by the enclosing module.
    """
    data = reader.Data().load_raw_data(
        [train, valid, test],
        add_beg_token='<s>',
        add_end_token='</s>',
    )
    nbest = reader.NBest(*nbest_files)

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'
    config.discount = '-wbdiscount'

    # The directory holding the external tools depends on the platform.
    bindir = 'd:\\wangbin\\tools' if wb.is_window() else '../../tools/srilm'

    # NOTE(review): the work directory is derived from the config before any
    # order is assigned, so every order below shares it - confirm intended.
    workdir = 'ngramlm/' + str(config)

    for order in (3, 4, 5, 6):
        config.order = order
        label = str(config)
        m = ngramlm.Model(config, data, bindir, workdir, name=label)

        print('train...')
        m.train(write_to_res=(res_file, label))

        print('rescore...')
        nbest.lmscore = m.rescore(nbest.get_nbest_list(data))

        wer = nbest.wer()
        summary = 'wer={} lmscale={}, acscale={}'.format(
            wer, nbest.lmscale, nbest.acscale)
        print(summary)
        fres.AddWER(label, wer)
예제 #3
0
def main():
    """Train n-gram models of order 5 and 6 and evaluate each one.

    Every model gets its own work directory (``ngramlm/<config string>``).
    After training, evaluation is delegated to ``nbest_eval``.

    ``train_files``, ``res_file``, ``fres`` and ``nbest_eval`` are not
    defined in this function; they are expected to be provided by the
    enclosing module.
    """
    data = reader.Data().load_raw_data(
        train_files,
        add_beg_token='<s>',
        add_end_token='</s>',
    )

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    # The directory holding the external tools depends on the platform.
    bindir = 'd:\\wangbin\\tools' if wb.is_window() else '../../tools/srilm'

    for order in (5, 6):
        config.order = order
        label = str(config)
        workdir = 'ngramlm/' + label
        m = ngramlm.Model(config, data, bindir, workdir, name=label)

        print('train...')
        m.train(write_to_res=(res_file, label))

        print('rescore...')
        nbest_eval(m, data, workdir, fres, label)
예제 #4
0
def main():
    """Train n-gram models of order 3 to 6 on data described by ``data_info``.

    The data set is created from the vocabulary, train, valid and test
    entries of ``data_info``; the n-best list comes from its ``'nbest'``
    entry. Each model is trained, used to rescore the n-best list, and its
    WER is printed and recorded through ``fres``.

    ``data_info``, ``res_file`` and ``fres`` are not defined in this
    function; they are expected to be provided by the enclosing module.
    """
    data = seq.Data(
        vocab_files=data_info['vocab'],
        train_list=data_info['train'],
        valid_list=data_info['valid'],
        test_list=data_info['test'],
    ).create_data()
    nbest = reader.NBest(*data_info['nbest'])

    config = ngramlm.Config(data)
    config.res_file = res_file

    # The directory holding the external tools depends on the platform.
    bindir = 'd:\\wangbin\\tools' if wb.is_window() else '../../../tools/srilm'

    # NOTE(review): the work directory is derived from the config before any
    # order is assigned, so every order below shares it - confirm intended.
    workdir = 'ngramlm/' + str(config)

    for order in (3, 4, 5, 6):
        config.order = order
        label = str(config)
        m = ngramlm.Model(config, data, bindir, workdir, name=label)

        print('train...')
        m.train(write_to_res=(res_file, label))

        print('rescore...')
        nbest.lmscore = m.rescore(nbest.get_nbest_list(data))

        wer = nbest.wer()
        summary = 'wer={} lmscale={}, acscale={}'.format(
            wer, nbest.lmscale, nbest.acscale)
        print(summary)
        fres.AddWER(label, wer)
예제 #5
0
def main():
    """Train a 5-gram model on the first 100 training files and rescore.

    The data set is loaded through ``reader.LargeData`` using the first
    ``train_files`` entries of ``data_info['train_all']``. The model is
    trained with cutoffs ``[0, 0, 2, 2, 5]``, the n-best list is rescored,
    the LM scores are written to ``nbest.lmscore`` in the work directory,
    and the WER is printed and recorded through ``fres``.

    ``data_info``, ``bindir``, ``res_file`` and ``fres`` are not defined in
    this function; they are expected to be provided by the enclosing module.
    """
    train_files = 100
    loader = reader.LargeData()
    data = loader.dynamicly_load_raw_data(
        sorted_vocab_file=data_info['vocab_cut3'],
        train_list=data_info['train_all'][0:train_files],
        valid_file=data_info['valid'],
        test_file=data_info['test'],
        add_beg_token='<s>',
        add_end_token='</s>',
        add_unknwon_token='<unk>',
        vocab_max_size=None,
    )

    nbest = reader.NBest(*reader.wsj0_nbest())

    config = ngramlm.Config(data)
    config.res_file = 'results.txt'

    for order in (5,):
        config.order = order
        config.cutoff = [0, 0, 2, 2, 5]

        # The model name carries the number of training files as a prefix.
        model_name = 't{}_'.format(train_files) + str(config)
        workdir = 'ngramlm/' + model_name

        # NOTE(review): sys.stdout is rebound here and never restored in
        # this function - confirm that is intended.
        log_path = os.path.join(workdir, 'ngram.log')
        sys.stdout = wb.std_log(log_path)

        datadir = 'ngramlm/data/'
        m = ngramlm.Model(config, data, bindir, workdir, datadir,
                          name=model_name)

        print('train...')
        with wb.processing('training'):
            m.train(write_to_res=(res_file, model_name))

        print('rescore...')
        with wb.processing('rescoring'):
            nbest.lmscore = m.rescore(nbest.get_nbest_list(data))
        score_path = os.path.join(workdir, 'nbest.lmscore')
        nbest.write_lmscore(score_path)

        wer = nbest.wer()
        summary = 'wer={} lmscale={}, acscale={}'.format(
            wer, nbest.lmscale, nbest.acscale)
        print(summary)
        fres.AddWER(model_name, wer)