示例#1
0
文件: main.py 项目: cjopengler/lmkit
def test():
    """Evaluate the RNNLM on the validation and test sets and print the results.

    Builds one ``TextIterator`` per data file and an ``RNNLM`` from the
    module-level configuration. If ``args.model_dir`` is an existing file the
    model is replaced by the result of ``load_model``; otherwise a
    "not found" message is printed and the freshly constructed model is
    evaluated as-is. For each data set the value returned by ``evaluate`` is
    printed as the cost, together with its exponential as the perplexity.
    """
    # Both iterators share every option except the data file they read.
    iterator_options = dict(n_batch=n_batch,
                            brown_or_huffman=brown_or_huffman,
                            mode=matrix_or_vector,
                            word2idx_path=word2idx_path)
    valid_data = TextIterator(valid_datafile, filepath, **iterator_options)
    test_data = TextIterator(test_datafile, filepath, **iterator_options)

    model = RNNLM(n_input, n_hidden, vocabulary_size, cell, optimizer, p,
                  mode=matrix_or_vector)

    checkpoint = args.model_dir
    if not os.path.isfile(checkpoint):
        print('%s not found' % (checkpoint,))
    else:
        print('loading pretrained model: %s' % (checkpoint,))
        model = load_model(checkpoint, model)

    valid_cost = evaluate(valid_data, model)
    print('valid cost: %s perplexity: %s' % (valid_cost, np.exp(valid_cost)))

    test_cost = evaluate(test_data, model)
    print('test cost: %s perplexity: %s' % (test_cost, np.exp(test_cost)))
示例#2
0
def train_by_batch(lr):
    """Train the RNNLM batch by batch with periodic logging, dumps and evaluation.

    Data iterators and the model are built from module-level configuration.
    When ``model_dir`` exists and ``reload_dumps == 1`` the parameters are
    reloaded through ``load_model``. When ``goto_line > 0`` the training
    iterator is advanced with ``goto_line`` and the batch counter starts from
    that value.

    Args:
        lr: learning rate, forwarded unchanged to ``model.train`` for every
            batch.

    Returns:
        -1 as soon as a batch cost is NaN or infinite (training is aborted);
        otherwise None after ``NEPOCH`` passes over the training iterator.
    """
    logger.info('loading dataset...')

    train_data = TextIterator(train_datafile, prefix_path=prefix_path, n_batch=n_batch, maxlen=maxlen)
    valid_data = TextIterator(valid_datafile, prefix_path=prefix_path, n_batch=n_batch, maxlen=maxlen)
    test_data = TextIterator(test_datafile, prefix_path=prefix_path, n_batch=n_batch, maxlen=maxlen)

    logger.info('building model...')
    model = RNNLM(n_input, n_hidden, vocab_size, n_class=train_data.n_class, node_len=train_data.node_maxlen,
                  rnn_cell=rnn_cell, optimizer=optimizer, p=p, node_mask_path=node_mask_path)

    if os.path.exists(model_dir) and reload_dumps == 1:
        logger.info('loading parameters from: %s' % model_dir)
        model = load_model(model_dir, model)
    else:
        print("init parameters....")
    if goto_line > 0:
        train_data.goto_line(goto_line)
        print('goto line: %s' % (goto_line,))
    print('training start...')
    start = time.time()
    idx = goto_line
    logger.info('training start...')

    # The running sum and its batch count live outside the epoch loop: the
    # display trigger (idx % disp_freq) keeps counting across epochs and may
    # start at goto_line, so the number of batches between two reports is not
    # always disp_freq. Dividing by the real count keeps the reported mean
    # cost (and perplexity) correct after an epoch boundary or a resume.
    error = 0
    n_accumulated = 0
    for epoch in xrange(NEPOCH):
        for x, x_mask, y_node, y_mask in train_data:
            idx += 1
            cost = model.train(x, x_mask, y_node, y_mask, lr)
            if np.isnan(cost) or np.isinf(cost):
                print('NaN Or Inf detected!')
                return -1
            error += cost
            n_accumulated += 1
            if idx % disp_freq == 0:
                mean_error = error / float(n_accumulated)
                logger.info('epoch: %d idx: %d cost: %f ppl: %f' % (
                    epoch, idx, mean_error, np.exp(mean_error)))
                error = 0
                n_accumulated = 0
            if idx % save_freq == 0:
                print('dumping...')
                # The file name carries the seconds elapsed since training began.
                save_model('./model/parameters_%.2f.pkl' % (time.time() - start), model)
            if idx % valid_freq == 0:
                logger.info('validing....')
                valid_cost = evaluate(valid_data, model)
                logger.info('valid_cost: %f perplexity: %f' % (valid_cost, np.exp(valid_cost)))
            if idx % test_freq == 0:
                logger.info('testing...')
                test_cost = evaluate(test_data, model)
                logger.info('test cost: %f perplexity: %f' % (test_cost, np.exp(test_cost)))

        sys.stdout.flush()

    print("Finished. Time = " + str(time.time() - start))
示例#3
0
def test():
    """Evaluate the RNNLM on the validation and test sets and print the results.

    The model is built with ``n_class`` and ``node_len`` taken from the
    validation iterator. If ``args.model_dir`` is an existing file the model
    is replaced by the result of ``load_model``; otherwise a "not found"
    message is printed and the freshly constructed model is evaluated as-is.
    For each data set the value returned by ``evaluate`` is printed as the
    cost, together with its exponential as the perplexity.
    """
    valid_data = TextIterator(valid_datafile, prefix_path=prefix_path, n_batch=n_batch)
    test_data = TextIterator(test_datafile, prefix_path=prefix_path, n_batch=n_batch)

    model = RNNLM(n_input, n_hidden, vocab_size,
                  rnn_cell=rnn_cell,
                  optimizer=optimizer,
                  p=p,
                  n_class=valid_data.n_class,
                  node_len=valid_data.node_maxlen,
                  node_mask_path=node_mask_path)

    checkpoint = args.model_dir
    if not os.path.isfile(checkpoint):
        print('%s not found' % (checkpoint,))
    else:
        print('loading pretrained model: %s' % (checkpoint,))
        model = load_model(checkpoint, model)

    valid_cost = evaluate(valid_data, model)
    print('valid cost: %s perplexity: %s' % (valid_cost, np.exp(valid_cost)))

    test_cost = evaluate(test_data, model)
    print('test cost: %s perplexity: %s' % (test_cost, np.exp(test_cost)))
示例#4
0
def test():
    """Evaluate the RNNLM on the validation and test sets and print the results.

    The test iterator is created before the validation iterator, but the
    validation set is evaluated first. If ``args.model_dir`` is an existing
    file the model is replaced by the result of ``load_model``; otherwise a
    "not found" message is printed and the freshly constructed model is
    evaluated as-is. For each data set the value returned by ``evaluate`` is
    printed as the cost, together with its exponential as the perplexity.
    """
    test_data = TextIterator(test_datafile, n_batch=n_batch)
    valid_data = TextIterator(valid_datafile, n_batch=n_batch)
    model = RNNLM(n_input, n_hidden, vocabulary_size, rnn_cell, optimizer, p)

    checkpoint = args.model_dir
    if not os.path.isfile(checkpoint):
        print('%s not found' % (checkpoint,))
    else:
        print('loading pretrained model: %s' % (checkpoint,))
        model = load_model(checkpoint, model)

    valid_cost = evaluate(valid_data, model)
    print('valid cost: %s perplexity: %s' % (valid_cost, np.exp(valid_cost)))

    test_cost = evaluate(test_data, model)
    print('test cost: %s perplexity: %s' % (test_cost, np.exp(test_cost)))
示例#5
0
def train(lr):
    """Train the RNNLM with periodic logging, dumps and evaluation.

    Data iterators and the model are built from module-level configuration.
    When ``model_dir`` is an existing file the parameters are reloaded through
    ``load_model``. When ``goto_line > 0`` the training iterator is advanced
    with ``goto_line`` and the batch counter starts from that value. In this
    variant ``evaluate`` is unpacked into a cost and a word error rate.

    Args:
        lr: learning rate, forwarded unchanged to ``model.train`` for every
            batch.

    Returns:
        -1 as soon as a batch cost is NaN or infinite (training is aborted);
        otherwise None after ``NEPOCH`` passes over the training iterator.
    """
    print('loading dataset...')
    train_data = TextIterator(train_datafile, n_batch=n_batch, maxlen=maxlen)
    valid_data = TextIterator(valid_datafile, n_batch=n_batch, maxlen=maxlen)
    test_data = TextIterator(test_datafile, n_batch=n_batch, maxlen=maxlen)
    print('building model...')
    model = RNNLM(n_input, n_hidden, vocabulary_size, rnn_cell, optimizer, p,
                  bptt)
    if os.path.isfile(model_dir):
        print('loading checkpoint parameters.... %s' % (model_dir,))
        model = load_model(model_dir, model)
    if goto_line > 0:
        train_data.goto_line(goto_line)
        print('goto line: %s' % (goto_line,))
    print('training start...')
    start = time.time()
    idx = goto_line

    # The running sum and its batch count live outside the epoch loop: the
    # display trigger (idx % disp_freq) keeps counting across epochs and may
    # start at goto_line, so the number of batches between two reports is not
    # always disp_freq. Dividing by the real count keeps the reported mean
    # cost (and perplexity) correct after an epoch boundary or a resume.
    error = 0
    n_accumulated = 0
    for epoch in xrange(NEPOCH):
        for x, x_mask, y, y_mask in train_data:
            idx += 1
            cost = model.train(x, x_mask, y, y_mask, lr)
            if np.isnan(cost) or np.isinf(cost):
                print('NaN Or Inf detected!')
                return -1
            error += cost
            n_accumulated += 1
            if idx % disp_freq == 0:
                mean_error = error / float(n_accumulated)
                logger.info('epoch: %d idx: %d cost: %f ppl: %f' %
                            (epoch, idx, mean_error, np.exp(mean_error)))
                error = 0
                n_accumulated = 0
            if idx % save_freq == 0:
                logger.info('dumping...')
                # The file name carries the seconds elapsed since training began.
                save_model(
                    './model/parameters_%.2f.pkl' % (time.time() - start),
                    model)
            if idx % valid_freq == 0:
                logger.info('validing...')
                valid_cost, wer = evaluate(valid_data, model)
                logger.info(
                    'validation cost: %f perplexity: %f,word_error_rate:%f' %
                    (valid_cost, np.exp(valid_cost), wer))
            if idx % test_freq == 0:
                logger.info('testing...')
                test_cost, wer = evaluate(test_data, model)
                logger.info('test cost: %f perplexity: %f,word_error_rate:%f' %
                            (test_cost, np.exp(test_cost), wer))

    print("Finished. Time = " + str(time.time() - start))
示例#6
0
文件: main.py 项目: cjopengler/lmkit
def test():
    """Evaluate the RNNLM on the validation and test sets and log the results.

    The test iterator is created before the validation iterator, but the
    validation set is evaluated first. If ``args.model_dir`` is an existing
    file the model is replaced by the result of ``load_model``; otherwise a
    "not found" message is printed and the freshly constructed model is
    evaluated as-is. ``evaluate`` is unpacked into a cost and a word error
    rate; both are logged together with the exponential of the cost as the
    perplexity.
    """
    test_data = TextIterator(test_datafile, n_batch=n_batch)
    valid_data = TextIterator(valid_datafile, n_batch=n_batch)
    model = RNNLM(n_input, n_hidden, vocabulary_size, rnn_cell, optimizer, p)

    checkpoint = args.model_dir
    if not os.path.isfile(checkpoint):
        print('%s not found' % (checkpoint,))
    else:
        print('loading pretrained model: %s' % (checkpoint,))
        model = load_model(checkpoint, model)

    valid_cost, valid_wer = evaluate(valid_data, model)
    valid_report = (valid_cost, np.exp(valid_cost), valid_wer)
    logger.info('validation cost: %f perplexity: %f,word_error_rate:%f' %
                valid_report)

    test_cost, test_wer = evaluate(test_data, model)
    test_report = (test_cost, np.exp(test_cost), test_wer)
    logger.info('test cost: %f perplexity: %f,word_error_rate:%f' %
                test_report)
示例#7
0
文件: main.py 项目: cjopengler/lmkit
def train(lr):
    """Train the RNNLM with periodic logging, dumps and evaluation.

    Data iterators and the model are built from module-level configuration.
    When ``model_dir`` exists and ``reload_dumps == 1`` the parameters are
    reloaded through ``load_model``. Each training batch yields a target
    triple ``(y_node, y_choice, y_bit_mask)`` that is passed to
    ``model.train`` along with the input, the masks and the learning rate.

    Args:
        lr: learning rate, forwarded unchanged to ``model.train`` for every
            batch.

    Returns:
        -1 as soon as a batch cost is NaN or infinite (training is aborted);
        otherwise None after ``NEPOCH`` passes over the training iterator.
    """
    logger.info('loading dataset...')

    train_data = TextIterator(train_datafile,
                              filepath,
                              n_batch=n_batch,
                              brown_or_huffman=brown_or_huffman,
                              mode=matrix_or_vector,
                              word2idx_path=word2idx_path)
    valid_data = TextIterator(valid_datafile,
                              filepath,
                              n_batch=n_batch,
                              brown_or_huffman=brown_or_huffman,
                              mode=matrix_or_vector,
                              word2idx_path=word2idx_path)
    test_data = TextIterator(test_datafile,
                             filepath,
                             n_batch=n_batch,
                             brown_or_huffman=brown_or_huffman,
                             mode=matrix_or_vector,
                             word2idx_path=word2idx_path)
    logger.info('building model...')
    model = RNNLM(n_input,
                  n_hidden,
                  vocabulary_size,
                  cell,
                  optimizer,
                  p=p,
                  mode=matrix_or_vector)
    if os.path.exists(model_dir) and reload_dumps == 1:
        logger.info('loading parameters from: %s' % model_dir)
        model = load_model(model_dir, model)
    else:
        logger.info("init parameters....")
    logger.info('training start...')
    start = time.time()
    idx = 0

    # The running sum and its batch count live outside the epoch loop: the
    # display trigger (idx % disp_freq) keeps counting across epochs, so the
    # number of batches between two reports is not always disp_freq when it
    # was reset per epoch. Dividing by the real count keeps the reported mean
    # cost (and perplexity) correct after an epoch boundary.
    error = 0
    n_accumulated = 0
    for epoch in xrange(NEPOCH):
        for x, x_mask, (y_node, y_choice, y_bit_mask), y_mask in train_data:
            idx += 1
            cost = model.train(x, x_mask, y_node, y_choice, y_bit_mask, y_mask,
                               lr)
            if np.isnan(cost) or np.isinf(cost):
                print('NaN Or Inf detected!')
                return -1
            error += cost
            n_accumulated += 1
            if idx % disp_freq == 0:
                mean_error = error / float(n_accumulated)
                logger.info('epoch: %d idx: %d cost: %f ppl: %f' %
                            (epoch, idx, mean_error, np.exp(mean_error)))
                error = 0
                n_accumulated = 0
            if idx % save_freq == 0:
                logger.info('dumping...')
                # The file name carries the seconds elapsed since training began.
                save_model(
                    './model/parameters_%.2f.pkl' % (time.time() - start),
                    model)
            if idx % valid_freq == 0:
                logger.info('validing....')
                valid_cost = evaluate(valid_data, model)
                logger.info('valid_cost: %f perplexity: %f' %
                            (valid_cost, np.exp(valid_cost)))
            if idx % test_freq == 0:
                logger.info('testing...')
                test_cost = evaluate(test_data, model)
                logger.info('test cost: %f perplexity: %f' %
                            (test_cost, np.exp(test_cost)))
            # NOTE: a periodic learning-rate decay (lr = lr * 0.9 every
            # clip_freq batches while lr >= 0.01) was sketched here but is
            # currently disabled.
        sys.stdout.flush()

    print("Finished. Time = " + str(time.time() - start))