예제 #1
0
def train_svr(dataset=''):
    """Fit a default ``svm.SVR`` on the training split and print the test MSE.

    ``dataset`` is accepted for interface compatibility but is not read here.
    Data comes from ``read_data(columns=1)``; the validation split is unpacked
    but unused. Nothing is returned: the result is only printed.
    """
    train_split, _valid_split, test_split = read_data(columns=1)

    def first_components(rows):
        # Keep only element 0 of every entry in each row.
        return [[entry[0] for entry in row] for row in rows]

    train_features = first_components(train_split[0])
    test_features = first_components(test_split[0])

    regressor = svm.SVR()
    regressor.fit(train_features, train_split[1])

    predicted = numpy.asarray(regressor.predict(test_features),
                              dtype='float32')
    target = numpy.asarray(test_split[1], dtype='float32')

    # Mean squared error between targets and predictions.
    residual = target - predicted
    cost = (residual ** 2).mean()

    # A single parenthesised expression prints the same text as the bare
    # print statement.
    print('Cost on Test sample, size: %d, cost: %f' % (len(test_features),
                                                       cost))
예제 #2
0
def train_svr(dataset=''):
    """Train an ``svm.SVR`` baseline and print its mean squared test error.

    The ``dataset`` argument is not used by the body. Splits are obtained
    from ``read_data(columns=1)``; only the train and test splits are read.
    """
    splits = read_data(columns=1)
    # Three-way unpack kept so a differently-shaped return still fails here.
    fit_split, _unused_valid, eval_split = splits

    fit_inputs = []
    for sequence in fit_split[0]:
        fit_inputs.append([step[0] for step in sequence])

    eval_inputs = []
    for sequence in eval_split[0]:
        eval_inputs.append([step[0] for step in sequence])

    model = svm.SVR()
    model.fit(fit_inputs, fit_split[1])
    raw_prediction = model.predict(eval_inputs)

    truth = numpy.asarray(eval_split[1], dtype='float32')
    estimate = numpy.asarray(raw_prediction, dtype='float32')

    squared_error = (truth - estimate) ** 2
    cost = squared_error.mean()

    report = 'Cost on Test sample, size: %d, cost: %f' % (len(eval_inputs),
                                                          cost)
    print(report)
예제 #3
0
def train_svr(dataset=''):
    """Fit ``svm.SVR`` on sequences of length 10 and print test MSE and R^2.

    ``dataset`` is not read by the body. ``read_data(columns=1, max_len=10)``
    supplies three splits plus ``mean`` and ``std``; the latter two and the
    validation split are unpacked but not applied, so both metrics are
    computed on the values exactly as ``read_data`` returned them.
    """
    (train_split, _valid_split, test_split,
     _mean, _std) = read_data(columns=1, max_len=10)

    def first_components(rows):
        # Keep only element 0 of every entry in each row.
        return [[entry[0] for entry in row] for row in rows]

    train_features = first_components(train_split[0])
    test_features = first_components(test_split[0])

    regressor = svm.SVR()
    regressor.fit(train_features, train_split[1])

    predicted = numpy.asarray(regressor.predict(test_features),
                              dtype='float32')
    target = numpy.asarray(test_split[1], dtype='float32')

    # One squared-residual array feeds both the MSE and the R^2 numerator.
    squared_residual = (target - predicted) ** 2
    residual_ss = squared_residual.sum()
    total_ss = ((target - target.mean()) ** 2).sum()

    r2 = 1. - (residual_ss / total_ss)
    cost = squared_residual.mean()

    print('Cost on Test sample, size: %d, cost: %f, R score: %f' % (
        len(test_features), cost, r2))
def train_svr(dataset=''):
    """Train an ``svm.SVR`` baseline; print test MSE and coefficient R^2.

    The ``dataset`` argument is not used. Data is loaded through
    ``read_data(columns=1, max_len=10)``. The returned ``mean``/``std`` are
    not applied, so the metrics are on the values as loaded.
    """
    loaded = read_data(columns=1, max_len=10)
    # Five-way unpack kept so a differently-shaped return still fails here.
    fit_split, _unused_valid, eval_split, _unused_mean, _unused_std = loaded

    fit_inputs = []
    for sequence in fit_split[0]:
        fit_inputs.append([step[0] for step in sequence])

    eval_inputs = []
    for sequence in eval_split[0]:
        eval_inputs.append([step[0] for step in sequence])

    model = svm.SVR()
    model.fit(fit_inputs, fit_split[1])
    raw_prediction = model.predict(eval_inputs)

    truth = numpy.asarray(eval_split[1], dtype='float32')
    estimate = numpy.asarray(raw_prediction, dtype='float32')

    error_sq = (truth - estimate) ** 2
    centred_sq = (truth - truth.mean()) ** 2

    # R^2 = 1 - SS_res / SS_tot
    r2 = 1. - (error_sq.sum() / centred_sq.sum())
    cost = error_sq.mean()

    report = 'Cost on Test sample, size: %d, cost: %f, R score: %f' % (
        len(eval_inputs), cost, r2)
    print(report)
예제 #5
0
def train_lstm(
        dim_proj=32,  # word embeding dimension and LSTM number of hidden units.
        patience=10,  # Number of epoch to wait before early stop if no progress
        max_epochs=150,  # The maximum number of epoch to run
        dispFreq=10,  # Display to stdout the training progress every N updates
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        lrate=0.1,  # Learning rate for sgd (not used for adadelta and rmsprop)
        n_input=4,  # Vocabulary size
        optimizer=mom_sgd,  # sgd,mom_sgs, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
        encoder='lstm',  # TODO: can be removed must be lstm.
        saveto='lstm_model.npz',  # The best model will be saved there
        validFreq=170,  # Compute the validation error after this number of update.
        saveFreq=1110,  # Save the parameters after every saveFreq updates
        maxlen=100,  # Sequence longer then this get ignored
        batch_size=16,  # The batch size during training.
        valid_batch_size=64,  # The batch size used for validation/test set.
        dataset='imdb',

        # Parameter for extra option
        noise_std=0.,
        use_dropout=False,  # if False slightly faster, but worst test error
        # This frequently need a bigger model.
    reload_model="",  # Path to a saved model we want to start from.
        sum_pool=False,
        mom_start=0.5,
        mom_end=0.99,
        mom_epoch_interval=100,
        learning_rate_decay=0.9995):

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    print 'Loading data'
    ydim = 1
    n_iter = 10

    train, valid, test = read_data(max_len=n_iter)

    #YDIM??
    #number of labels (output)

    model_options['ydim'] = ydim
    model_options['n_iter'] = n_iter

    theano.config.optimizer = 'None'

    print 'Building model'
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, y, f_pred_prob, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]),
                                   valid_batch_size,
                                   shuffle=True)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, shuffle=True)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])
    history_errs = []
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.clock()
    mom = 0

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray formet.
                # It return something of the shape (minibatch maxlen, n samples)
                x, y = prepare_data(x, y, model_options['n_iter'],
                                    model_options['n_input'])

                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    continue
                n_samples += x.shape[1]
                if eidx < model_options['mom_epoch_interval']:
                    mom = model_options['mom_start']*\
                    (1.0 - eidx/model_options['mom_epoch_interval'])\
                      + mom_end*(eidx/model_options['mom_epoch_interval'])
                else:
                    mom = mom_end

                cost = f_grad_shared(x, y)
                f_update(lrate, mom)

                #decay
                lrate = learning_rate_decay * lrate

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                if numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    #train_err = pred_error(f_pred_prob, prepare_data, train, kf, model_options)
                    valid_err = pred_error(f_pred_prob, prepare_data, valid,
                                           kf_valid, model_options)
                    test_err = pred_error(f_pred_prob, prepare_data, test,
                                          kf_test, model_options)
                    r_score = R_score(f_pred_prob, prepare_data, test, kf_test,
                                      model_options)

                    history_errs.append([valid_err, test_err])

                    if (uidx == 0 or valid_err <=
                            numpy.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Valid ', valid_err, 'Test ', test_err, 'R_score ',
                          r_score)

                    if (len(history_errs) > patience and valid_err >=
                            numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.clock()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    train_err = pred_error(f_pred_prob, prepare_data, train, kf, model_options)
    valid_err = pred_error(f_pred_prob, prepare_data, valid, kf_valid,
                           model_options)
    test_err = pred_error(f_pred_prob, prepare_data, test, kf_test,
                          model_options)
    r_score = R_score(f_pred_prob, prepare_data, test, kf_test, model_options)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, 'R2 score ', r_score

    numpy.savez(saveto,
                train_err=train_err,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
def train_lstm(
    dim_proj=32,  # word embeding dimension and LSTM number of hidden units.
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=150,  # The maximum number of epoch to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.0,  # Weight decay for the classifier applied to the U weights.
    lrate=0.1,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_input=4,  # Vocabulary size
    optimizer=mom_sgd,  # sgd,mom_sgs, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
    encoder="lstm",  # TODO: can be removed must be lstm.
    saveto="lstm_model.npz",  # The best model will be saved there
    validFreq=170,  # Compute the validation error after this number of update.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequence longer then this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset="imdb",
    # Parameter for extra option
    noise_std=0.0,
    use_dropout=False,  # if False slightly faster, but worst test error
    # This frequently need a bigger model.
    reload_model="",  # Path to a saved model we want to start from.
    sum_pool=False,
    mom_start=0.5,
    mom_end=0.99,
    mom_epoch_interval=300,
    learning_rate_decay=0.99995,
):

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    print "Loading data"
    ydim = 2
    n_iter = 10

    train, valid, test, mean, std = read_data(max_len=n_iter, up=True)

    # YDIM??
    # number of labels (output)

    model_options["ydim"] = ydim
    model_options["n_iter"] = n_iter

    theano.config.optimizer = "None"

    print "Building model"
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params("lstm_model.npz", params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, y, f_pred_prob, cost) = build_model(tparams, model_options)

    if decay_c > 0.0:
        decay_c = theano.shared(numpy.float32(decay_c), name="decay_c")
        weight_decay = 0.0
        weight_decay += (tparams["U"] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name="f_cost")

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, y], grads, name="f_grad")

    lr = tensor.scalar(name="lr")
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print "Optimization"

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size, shuffle=True)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size, shuffle=True)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])
    history_errs = []
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.clock()
    mom = 0

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.0)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray formet.
                # It return something of the shape (minibatch maxlen, n samples)
                x, y = prepare_data(x, y, model_options["n_iter"], model_options["n_input"], up=True)

                if x is None:
                    print "Minibatch with zero sample under length ", maxlen
                    continue
                n_samples += x.shape[1]
                if eidx < model_options["mom_epoch_interval"]:
                    mom = model_options["mom_start"] * (1.0 - eidx / model_options["mom_epoch_interval"]) + mom_end * (
                        eidx / model_options["mom_epoch_interval"]
                    )
                else:
                    mom = mom_end

                cost = f_grad_shared(x, y)
                f_update(lrate, mom)

                # decay
                lrate = learning_rate_decay * lrate

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print "NaN detected"
                    return 1.0, 1.0, 1.0

                if numpy.mod(uidx, dispFreq) == 0:
                    print "Epoch ", eidx, "Update ", uidx, "Cost ", cost

                if numpy.mod(uidx, saveFreq) == 0:
                    print "Saving...",

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open("%s.pkl" % saveto, "wb"), -1)
                    print "Done"

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.0)
                    # train_err = pred_error(f_pred_prob, prepare_data, train, kf, model_options)
                    valid_err = pred_error(f_pred_prob, prepare_data, valid, kf_valid, model_options)
                    test_err = pred_error(f_pred_prob, prepare_data, test, kf_test, model_options)

                    history_errs.append([valid_err, test_err])

                    if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print ("Valid ", valid_err, "Test ", test_err)

                    if len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                        bad_counter += 1
                        if bad_counter > patience:
                            print "Early Stop!"
                            estop = True
                            break

            print "Seen %d samples" % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.clock()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.0)
    train_err = pred_error(f_pred_prob, prepare_data, train, kf, model_options)
    valid_err = pred_error(f_pred_prob, prepare_data, valid, kf_valid, model_options)
    test_err = pred_error(f_pred_prob, prepare_data, test, kf_test, model_options)

    print "Train ", train_err, "Valid ", valid_err, "Test ", test_err

    numpy.savez(
        saveto, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **best_p
    )
    print "The code run for %d epochs, with %f sec/epochs" % ((eidx + 1), (end_time - start_time) / (1.0 * (eidx + 1)))
    print >> sys.stderr, ("Training took %.1fs" % (end_time - start_time))
    return train_err, valid_err, test_err