Example #1
        if stochastic:
            stochastic_training = True
        else:
            binary_test = True

    print('Loading the dataset')

    train_set = SVHN(which_set='splitted_train', axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid', axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test', axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (598388, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (6000, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (26032, 3, 32, 32))

    # for hinge loss
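    # the labels are assumed to be one-hot vectors in {0, 1}; 2*y - 1 maps them
    # to {-1, +1}, the target encoding a (squared) hinge loss expects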
    train_set.y = np.subtract(np.multiply(2, train_set.y), 1.)
    valid_set.y = np.subtract(np.multiply(2, valid_set.y), 1.)
    test_set.y = np.subtract(np.multiply(2, test_set.y), 1.)

    print('Creating the model')

    class DeepCNN(Network):
        def __init__(self, rng):

            Network.__init__(self, n_hidden_layer=8, BN=BN)
Example #2
    train_set = SVHN(which_set='splitted_train',
                     axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    axes=['b', 'c', 0, 1])
    
    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_set.X = np.reshape(np.subtract(np.multiply(2./255.,train_set.X),1.),(-1,3,32,32))
    valid_set.X = np.reshape(np.subtract(np.multiply(2./255.,valid_set.X),1.),(-1,3,32,32))
    test_set.X = np.reshape(np.subtract(np.multiply(2./255.,test_set.X),1.),(-1,3,32,32))
    # print(np.max(train_set.X))
    # print(np.min(train_set.X))
    
    # for hinge loss (targets are already onehot)
    train_set.y = np.subtract(np.multiply(2,train_set.y),1.)
    valid_set.y = np.subtract(np.multiply(2,valid_set.y),1.)
    test_set.y = np.subtract(np.multiply(2,test_set.y),1.)

    print('Building the CNN...') 
    
    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
Example #3
            path= "${SVHN_LOCAL_PATH}",
            axes= ['b', 'c', 0, 1])
     
    valid_set = SVHN(
        which_set= 'valid',
        path= "${SVHN_LOCAL_PATH}",
        axes= ['b', 'c', 0, 1])
    
    test_set = SVHN(
        which_set= 'test',
        path= "${SVHN_LOCAL_PATH}",
        axes= ['b', 'c', 0, 1])
    
    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X,(598388,3,32,32))
    valid_set.X = np.reshape(valid_set.X,(6000,3,32,32))
    test_set.X = np.reshape(test_set.X,(26032,3,32,32))
    
    # for hinge loss
    train_set.y = np.subtract(np.multiply(2,train_set.y),1.)
    valid_set.y = np.subtract(np.multiply(2,valid_set.y),1.)
    test_set.y = np.subtract(np.multiply(2,test_set.y),1.)
    
    print('Creating the model')
    
    class DeepCNN(Network):

        def __init__(self, rng):

            Network.__init__(self, n_hidden_layer=8, BN=BN)
    print("shuffle_parts = " + str(shuffle_parts))

    print('Loading SVHN dataset')
    # only load the 73257 training examples, not the extra 531131 examples
    # this is done for computational reasons
    train_set = SVHN(which_set='train', axes=['b', 'c', 0, 1])

    # we only test the train accuracy in this evaluation.
    #    test_set = SVHN(
    #        which_set= 'train',
    #        axes= ['b', 'c', 0, 1])

    print('Building the CNN...')

    # load the randomized dataset that was saved when the training was done.
    train_set.X = np.load('X_values_SVHN.npy')
    train_set.y = np.load('Y_values_SVHN.npy')

    # load the first 7000 samples
    train_set.X = train_set.X[:7000, :, :, :]
    train_set.y = train_set.y[:7000, :]
    print(train_set.X.shape)

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)
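    # batch dimension left as None; 3 channels and 32x32 pixels match the bc01 reshape above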

    # 128C3-128C3-P2
Example #5
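The snippets on this page omit their import headers. For Example #5 (and the very similar Example #7), a plausible set of imports looks roughly like the following; the standard-library and third-party names are standard and the SVHN class matches pylearn2's SVHN dataset wrapper, but the four project-local module names at the end are guesses inferred from how they are used:

# import time
# from collections import OrderedDict
# import numpy as np
# import theano
# import theano.tensor as T
# import lasagne
# from pylearn2.datasets.svhn import SVHN
# import quantized_net as laq   # quantized Conv2DLayer/DenseLayer helpers (name assumed)
# import binary_net as lab      # binarized layer helpers, used in Example #7 (name assumed)
# import batch_norm             # BatchNormLayer wrapper (name assumed)
# import optimizer              # Adam implementation (name assumed)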
def main(method, LR_start):

    name = "svhn"
    print("dataset = " + str(name))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...
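    # LR is multiplied by LR_decay at epochs 15 and 25 (see the schedule at the bottom of main)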

    activation = lasagne.nonlinearities.rectify

    # number of filters in the first convolutional layer
    K = 64
    print("K=" + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = laq.Conv2DLayer(l_in,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)

    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = laq.Conv2DLayer(l_nl1,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))

    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)

    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)
    # 256C3-256C3-P2
    l_cnn3 = laq.Conv2DLayer(l_nl2,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)

    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = laq.Conv2DLayer(l_nl3,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))

    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)

    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = laq.Conv2DLayer(l_nl4,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)

    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = laq.Conv2DLayer(l_nl5,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))

    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)

    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = laq.DenseLayer(l_nl6,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)

    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = laq.DenseLayer(l_nl7,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)

    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = laq.DenseLayer(l_nl8,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=10,
                           method=method)

    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
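    # with targets in {-1, +1}, each output unit contributes max(0, 1 - t*o)^2;
    # the mean is taken over both the minibatch and the 10 output units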

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, quantized=True)
        W_grads = laq.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = laq.clipping_scaling(updates, l_out)
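        # clipping_scaling is a project-specific helper not shown here; presumably it clips
        # the real-valued weights and rescales their updates, as in BinaryConnect-style training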

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out,
                                               trainable=True,
                                               quantized=False)
        updates = OrderedDict(list(updates.items()) + list(optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items()))

        ## copy the current quantized (ternary) weights into their tagged shared variables
        ternary_weights = laq.get_quantized_weights(loss, l_out)
        updates2 = OrderedDict()
        idx = 0
        tt_tag = lasagne.layers.get_all_params(l_out, tt=True)
        for tt_tag_temp in tt_tag:
            updates2[tt_tag_temp] = ternary_weights[idx]
            idx = idx + 1
        updates = OrderedDict(list(updates.items()) + list(updates2.items()))

        ## update the 2nd moment estimate (this could also be obtained from the Adam optimizer)
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
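        # exponential moving average of squared gradients,
        # acc <- beta2 * acc + (1 - beta2) * g^2 (Adam's second-moment estimate)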
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[
                idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(list(updates.items()) + list(updates3.items()))

    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)

    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input, target, LR], loss, updates=updates)

    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(
        which_set='splitted_train',
        # which_set= 'valid',
        path="${SVHN_LOCAL_PATH}",
        axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
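    # np.eye(10)[y] selects row y of the 10x10 identity matrix, giving a one-hot vector per label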
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    X_train = train_set.X
    y_train = train_set.y
    X_val = valid_set.X
    y_val = valid_set.y
    X_test = test_set.X
    y_test = test_set.y

    # This function trains the model a full epoch (on the whole dataset)
    def train_epoch(X, y, LR):

        loss = 0
        batches = len(X) // batch_size
        # move shuffle here to save memory
        # k = 5
        # batches = int(batches/k)*k
        shuffled_range = np.random.permutation(len(X))

        for i in range(batches):
            tmp_ind = shuffled_range[i * batch_size:(i + 1) * batch_size]
            newloss = train_fn(X[tmp_ind], y[tmp_ind], LR)
            loss += newloss
        loss /= batches
        return loss

    # This function tests the model a full epoch (on the whole dataset)
    def val_epoch(X, y):

        err = 0
        loss = 0
        batches = len(X) // batch_size

        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size],
                                       y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss

        err = err / batches * 100
        loss /= batches

        return err, loss

    best_val_err = 100
    best_epoch = 1
    LR = LR_start
    # We iterate over epochs:
    for epoch in range(1, num_epochs + 1):

        start_time = time.time()
        train_loss = train_epoch(X_train, y_train, LR)

        val_err, val_loss = val_epoch(X_val, y_val)

        # test if validation error went down
        if val_err <= best_val_err:

            best_val_err = val_err
            best_epoch = epoch

            test_err, test_loss = val_epoch(X_test, y_test)

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch) + " of " + str(num_epochs) + " took " +
              str(epoch_duration) + "s")
        print("  LR:                            " + str(LR))
        print("  training loss:                 " + str(train_loss))
        print("  validation loss:               " + str(val_loss))
        print("  validation error rate:         " + str(val_err) + "%")
        print("  best epoch:                    " + str(best_epoch))
        print("  best validation error rate:    " + str(best_val_err) + "%")
        print("  test loss:                     " + str(test_loss))
        print("  test error rate:               " + str(test_err) + "%")

        with open(
                "{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method),
                "a") as myfile:
            myfile.write(
                "{0}  {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n"
                .format(epoch, train_loss, val_loss, test_loss, val_err,
                        test_err, epoch_duration, LR))

        ## Learning rate update scheme
        if epoch == 15 or epoch == 25:
            LR *= LR_decay
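
A minimal sketch of how this function might be invoked if the snippet is used as a script entry point. "FPN" is the only method name the code itself mentions (the full-precision branch above); the LR_start value is purely illustrative:

if __name__ == "__main__":
    # train the full-precision baseline; swap in another method string for quantized training
    main(method="FPN", LR_start=0.001)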
Example #6
    train_set = SVHN(which_set='splitted_train',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    # for hinge loss (targets are already onehot)
    train_set.y = np.subtract(np.multiply(2, train_set.y), 1.)
    valid_set.y = np.subtract(np.multiply(2, valid_set.y), 1.)
    test_set.y = np.subtract(np.multiply(2, test_set.y), 1.)

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)
Example #7
def main(method, LR_start, Binarize_weight_only):

    name = "svhn"
    print("dataset = " + str(name))

    print("Binarize_weight_only=" + str(Binarize_weight_only))

    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    ## number of filters in the first convolutional layer
    K = 64
    print("K=" + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = lab.Conv2DLayer(l_in,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)

    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = lab.Conv2DLayer(l_nl1,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))

    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)

    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)
    # 256C3-256C3-P2
    l_cnn3 = lab.Conv2DLayer(l_nl2,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)

    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = lab.Conv2DLayer(l_nl3,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))

    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)

    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = lab.Conv2DLayer(l_nl4,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)

    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = lab.Conv2DLayer(l_nl5,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))

    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)

    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = lab.DenseLayer(l_nl6,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)

    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = lab.DenseLayer(l_nl7,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)

    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = lab.DenseLayer(l_nl8,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=10,
                           method=method)

    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, binary=True)
        W_grads = lab.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(list(updates.items()) + list(optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items()))

        ## update the 2nd moment estimate (this could also be obtained from the Adam optimizer)
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[
                idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(list(updates.items()) + list(updates3.items()))
    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(
        which_set='splitted_train',
        # which_set= 'valid',
        path="${SVHN_LOCAL_PATH}",
        axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    # ipdb.set_trace()
    lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay,
              num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y,
              test_set.X, test_set.y)