def load_data( maxlen=3000 ):
    ''' Load dataset '''
    train, valid, test = imdb.load_data()
    tr_inp, _, tr_targ = imdb.prepare_data( train[0], train[1], maxlen=maxlen )
    te_inp, _, te_targ = imdb.prepare_data( test[0], test[1], maxlen=maxlen )
    v_inp, _, v_targ = imdb.prepare_data( valid[0], valid[1], maxlen=maxlen )
    train = shuffle( np.transpose( tr_inp ), reformat( np.asarray( tr_targ ), 2 ) )
    test = shuffle( np.transpose( te_inp ), reformat( np.asarray( te_targ ), 2 ) )
    valid = shuffle( np.transpose( v_inp ), reformat( np.asarray( v_targ ), 2 ) )
    print "Train shape : {}, {}".format( train[0].shape, train[1].shape )
    print "Test shape : {}, {}".format( test[0].shape, test[1].shape )
    print "Valid shape : {}, {}".format( valid[0].shape, valid[1].shape )
    imdb_dict = pickle.load( open('imdb.dict.pkl','rb') )
    return train, test, valid, imdb_dict
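The helpers shuffle and reformat used above are not shown in the snippet. A minimal sketch of what they might look like, assuming reformat(labels, n_classes) one-hot encodes the integer labels and shuffle applies the same random permutation to inputs and targets (both the names and the behaviour are assumptions, not taken from the original repository):

import numpy as np

def reformat(labels, n_classes):
    # Assumed helper: one-hot encode integer labels into n_classes columns.
    return (np.arange(n_classes) == labels[:, None]).astype(np.float32)

def shuffle(inputs, targets):
    # Assumed helper: permute inputs and targets with the same random order.
    perm = np.random.permutation(len(inputs))
    return inputs[perm], targets[perm]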
Example No. 2
def main(unused_args):

    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words,
                                        valid_portion=0.05,
                                        maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    #eval_config.batch_size = 1
    #eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" %
                  (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)
Example No. 3
    def generate_data(self):
        '''Load the dataset.

        Generate the train and valid datasets.

        '''
        print("Loading data...")
        train, valid, _ = load_data(path=self.path)
        self.X_train, self.X_mask_train, self.Y_train = prepare_data(train[0], train[1], maxlen=self.maxlen)
        self.X_valid, self.X_mask_valid, self.Y_valid = prepare_data(valid[0], valid[1], maxlen=self.maxlen)
        del train, valid
        print(len(self.X_train), 'train sequences')
        print(len(self.X_valid), 'valid sequences')
        print("Pad sequences (samples x time)")
        self.X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
        self.X_valid = sequence.pad_sequences(self.X_valid, maxlen=self.maxlen)
        print('X_train shape:', self.X_train.shape)
        print('X_valid shape:', self.X_valid.shape)
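Here sequence.pad_sequences is presumably the Keras preprocessing helper: it left-pads (or truncates) each list of word indices to a fixed length and returns a 2-D array of shape (n_samples, maxlen). A small standalone illustration with made-up values:

from keras.preprocessing import sequence

batch = [[11, 4, 7], [3, 9]]           # two reviews of different lengths
padded = sequence.pad_sequences(batch, maxlen=5)
print(padded.shape)                    # (2, 5)
print(padded)                          # [[ 0  0 11  4  7]
                                       #  [ 0  0  0  3  9]]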
Example No. 4
def main(unused_args):
    
    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()
        
        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op) 
            print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)
Example No. 5
    print('Build model ')
    X,Mask,Y,\
    cost,err, \
    train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                                    embsize=embsize,
                                                                    hiddensize=hiddensize)

    print('Training ')
    for eidx in range(max_epochs):
        kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
        costs = []
        errs = []
        for _, train_index in kf:
            # Select the random examples for this minibatch
            y = [train[1][t] for t in train_index]
            x = [train[0][t] for t in train_index]

            # Get the data in numpy.ndarray format.
            # This swaps the axes!
            # Returns arrays of shape (minibatch maxlen, n_samples).
            x, mask, y = imdb.prepare_data(x, y)
            cost, err = train_function(x, mask, y)
            if np.isnan(cost) or np.isnan(err):
                continue
            costs.append(float(cost))
            errs.append(float(err))
        costs = np.array(costs)
        errs = np.array(errs)
        print "Epoch {0}: Cost {1} Err {2}".format(eidx, np.mean(costs),
                                                   np.mean(errs))
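The imdb.prepare_data called here (and in the other examples) is the helper from the Theano LSTM tutorial: it drops sequences longer than maxlen, zero-pads the rest up to the longest sequence in the minibatch, builds a matching 0/1 mask, and returns time-major arrays of shape (maxlen, n_samples) plus the labels. A minimal re-implementation sketch under those assumptions:

import numpy as np

def prepare_data_sketch(seqs, labels, maxlen=None):
    # Optionally filter out sequences that are too long (mirrors the tutorial).
    if maxlen is not None:
        kept = [(s, l) for s, l in zip(seqs, labels) if len(s) < maxlen]
        if not kept:
            return None, None, None
        seqs, labels = zip(*kept)

    lengths = [len(s) for s in seqs]
    n_samples = len(seqs)
    max_len = max(lengths)

    # Time-major layout: one column per sample, padded with zeros.
    x = np.zeros((max_len, n_samples), dtype='int64')
    mask = np.zeros((max_len, n_samples), dtype='float32')
    for i, s in enumerate(seqs):
        x[:lengths[i], i] = s
        mask[:lengths[i], i] = 1.

    return x, mask, np.asarray(labels)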
Example No. 6
def train_lstm(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
    patience=10,  # Number of epochs to wait before early stopping if no progress
    max_epochs=50,  # The maximum number of epochs to run
    dispFreq=10,  # Display the training progress to stdout every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='save/lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of updates.
    saveFreq=50,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for the validation/test sets.
    dataset='imdb',

    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False, slightly faster, but worse test error.
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
):

    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    #load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep
        # examples of random sizes, so we select a random subset
        # of the examples.
        idx = np.arange(len(test[0]))
        np.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = np.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = T.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = T.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = T.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format.
                # This swaps the axes!
                # Returns arrays of shape (minibatch maxlen, n_samples).
                x, mask, y = imdb.prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print('Done')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, imdb.prepare_data, train, kf)
                    valid_err = pred_error(f_pred, imdb.prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= np.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, imdb.prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

    print( 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err )
    if saveto:
        np.savez(saveto, train_err=train_err,
                 valid_err=valid_err, test_err=test_err,
                 history_errs=history_errs, **best_p)
    print('The code ran for %d epochs, with %f sec/epoch' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print( ('Training took %.1fs' %
            (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
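A quick way to exercise train_lstm end to end is a short run with a reduced vocabulary and a trimmed test set; the keyword names below come from the signature above, while the specific values are only illustrative:

if __name__ == '__main__':
    # Illustrative smoke test: one epoch, smaller vocabulary, 500 test examples.
    train_err, valid_err, test_err = train_lstm(
        max_epochs=1,
        n_words=5000,
        test_size=500,
        saveto='save/lstm_model_smoke.npz',
    )
    print(train_err, valid_err, test_err)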
Example No. 7
def prepare_data_sp(x, y, maxlen=None):
  x, mask, y = imdb.prepare_data(x, y, maxlen)
  return (x.transpose(), mask.transpose(), y)
Example No. 8
    train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                                    embsize=embsize,
                                                                    hiddensize=hiddensize)

    print('Training ')
    for eidx in range(max_epochs):
        kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
        costs = []
        errs = []
        for _, train_index in kf:
            # Select the random examples for this minibatch
            y = [train[1][t] for t in train_index]
            x = [train[0][t] for t in train_index]

            # Get the data in numpy.ndarray format.
            # This swaps the axes!
            # Returns arrays of shape (minibatch maxlen, n_samples).
            x, mask, y = imdb.prepare_data(x, y)
            cost, err = train_function(x, mask, y)
            if np.isnan(cost) or np.isnan(err):
                continue
            costs.append(float(cost))
            errs.append(float(err))
        costs = np.array(costs)
        errs = np.array(errs)
        print "Epoch {0}: Cost {1} Err {2}".format(eidx, np.mean(costs), np.mean(errs))

Example No. 9
def prepare_data_sp(x, y, maxlen=None):
    x, mask, y = imdb.prepare_data(x, y, maxlen)
    return (x.transpose(), mask.transpose(), y)
Example No. 10
from imdb import load_data, prepare_data
import numpy as np
import pickle as pkl

train, valid, test = load_data(n_words=10, valid_portion=0.05)
x = [train[0][t] for t in range(0, len(train[0]))]
y = [train[1][t] for t in range(0, len(train[1]))]
x, mask, y = prepare_data(x, y)
y = np.array(y)
feat_train = np.zeros((x.shape[0], x.shape[1], 10))
for i in range(0, x.shape[0]):
    print "num: " + str(i)
    for j in range(0, x.shape[1]):
        feat_train[i][j][x[i][j]] = 1
np.save("data/feats_train.npy", feat_train)
np.save("data/labels_train.npy", y)
np.save("data/mask_train.npy", mask)