class TestSVHN(unittest.TestCase):
    # WARN: SVHN tries to overwrite files as part of __init__.

    def setUp(self):
        try:
            self.train = SVHN(which_set='train')
            self.test = SVHN(which_set='test')
        except (NoDataPathError, NotInstalledError):
            raise SkipTest()

    def test_get_test_set(self):
        try:
            test_from_train = self.train.get_test_set()
            self.assertTrue(numpy.all(test_from_train.get_design_matrix() ==
                                      self.test.get_design_matrix()))
            self.assertTrue(numpy.all(test_from_train.get_targets() ==
                                      self.test.get_targets()))
        except (NoDataPathError, NotInstalledError):
            raise SkipTest()
def load_dataset(self):
    # TODO: we might need other variables for identifying what kind of
    # extra preprocessing was done, such as feature products and the
    # number of features kept based on MI.
    if self.state.dataset == 'mnist':
        self.test_ddm = MNIST(which_set='test', one_hot=True)
        dataset = MNIST(which_set='train', shuffle=True, one_hot=True)
        train_X, valid_X = np.split(dataset.X, [50000])
        train_y, valid_y = np.split(dataset.y, [50000])
        self.train_ddm = DenseDesignMatrix(X=train_X, y=train_y)
        self.valid_ddm = DenseDesignMatrix(X=valid_X, y=valid_y)
    elif self.state.dataset == 'svhn':
        self.train_ddm = SVHN(which_set='splitted_train')
        self.test_ddm = SVHN(which_set='test')
        self.valid_ddm = SVHN(which_set='valid')
    elif self.state.dataset == 'cifar10':
        self.train_ddm = My_CIFAR10(which_set='train', one_hot=True)
        self.test_ddm = None
        self.valid_ddm = My_CIFAR10(which_set='test', one_hot=True)

    if self.train_ddm is not None:
        self.nvis = self.train_ddm.X.shape[1]
        self.nout = self.train_ddm.y.shape[1]
        print "nvis, nout :", self.nvis, self.nout
        self.ntrain = self.train_ddm.X.shape[0]
        print "ntrain :", self.ntrain
    if self.valid_ddm is not None:
        self.nvalid = self.valid_ddm.X.shape[0]
        print "nvalid :", self.nvalid
    if self.test_ddm is not None:
        self.ntest = self.test_ddm.X.shape[0]
        print "ntest :", self.ntest
def get_dim_input(state):
    if state.dataset == 'mnist':
        dataset = MNIST(which_set='test')
        dim = dataset.X.shape[1]
    elif state.dataset == 'svhn':
        dataset = SVHN(which_set='test')
        dim = dataset.X.shape[1]
    elif state.dataset == 'cifar10':
        dataset = My_CIFAR10(which_set='test')
        dim = dataset.X.shape[1]
    else:
        raise ValueError(
            'only mnist, cifar10 and svhn are supported for now in '
            'get_dim_input')
    del dataset
    return dim
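# Hedged usage sketch for get_dim_input (not from the source): `state` only
# needs a `dataset` attribute, so a namedtuple stands in for the real job state.
from collections import namedtuple

State = namedtuple('State', ['dataset'])
print(get_dim_input(State(dataset='mnist')))  # 784 for flattened 28x28 MNIST images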
# Old hyperparameters
binary_training = False
stochastic_training = False
binary_test = False
stochastic_test = False
if BinaryConnect == True:
    binary_training = True
    if stochastic == True:
        stochastic_training = True
    else:
        binary_test = True

print 'Loading the dataset'

train_set = SVHN(which_set='splitted_train', axes=['b', 'c', 0, 1])
valid_set = SVHN(which_set='valid', axes=['b', 'c', 0, 1])
test_set = SVHN(which_set='test', axes=['b', 'c', 0, 1])

# bc01 format
# print train_set.X.shape
train_set.X = np.reshape(train_set.X, (598388, 3, 32, 32))
valid_set.X = np.reshape(valid_set.X, (6000, 3, 32, 32))
test_set.X = np.reshape(test_set.X, (26032, 3, 32, 32))

# for hinge loss
train_set.y = np.subtract(np.multiply(2, train_set.y), 1.)
valid_set.y = np.subtract(np.multiply(2, valid_set.y), 1.)
test_set.y = np.subtract(np.multiply(2, test_set.y), 1.)
print("LR_fin = " + str(LR_fin)) LR_decay = (LR_fin / LR_start)**(1. / num_epochs) print("LR_decay = " + str(LR_decay)) # BTW, LR decay might good for the BN moving average... # for SVHN, depending on available CPU memory # 1, 2, 4, 7 or 14 shuffle_parts = 1 # shuffle_parts = 2 # does not work on bart5 # shuffle_parts = 4 # seems to work on bart5 # shuffle_parts = 7 # just to be safe print("shuffle_parts = " + str(shuffle_parts)) print('Loading SVHN dataset') train_set = SVHN(which_set='train', axes=['b', 'c', 0, 1]) valid_set = SVHN(which_set='valid', axes=['b', 'c', 0, 1]) test_set = SVHN(which_set='test', axes=['b', 'c', 0, 1]) # bc01 format # Inputs in the range [-1,+1] # print("Inputs in the range [-1,+1]") train_set.X = np.reshape( np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32)) valid_set.X = np.reshape( np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32)) test_set.X = np.reshape( np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32)) # print(np.max(train_set.X))
# Old hyperparameters
binary_training = False
stochastic_training = False
binary_test = False
stochastic_test = False
if BinaryConnect == True:
    binary_training = True
    if stochastic == True:
        stochastic_training = True
    else:
        binary_test = True

print 'Loading the dataset'

train_set = SVHN(which_set='splitted_train',
                 path="${SVHN_LOCAL_PATH}",
                 axes=['b', 'c', 0, 1])
valid_set = SVHN(which_set='valid',
                 path="${SVHN_LOCAL_PATH}",
                 axes=['b', 'c', 0, 1])
test_set = SVHN(which_set='test',
                path="${SVHN_LOCAL_PATH}",
                axes=['b', 'c', 0, 1])

# bc01 format
# print train_set.X.shape
train_set.X = np.reshape(train_set.X, (598388, 3, 32, 32))
if not os.path.isfile(os.path.join(local_path, d_set)):
    logging.info("Copying data from {0} to {1}".format(
        os.path.join(orig_path, d_set), local_path))
    shutil.copyfile(os.path.join(orig_path, d_set),
                    os.path.join(local_path, d_set))


def check_dtype(data):
    if str(data.X.dtype) != config.floatX:
        logging.warning("The dataset is saved as {}, changing theano's floatX "
                        "to the same dtype".format(data.X.dtype))
        config.floatX = str(data.X.dtype)


# Load train data
train = SVHN('splitted_train', path=local_path)
check_dtype(train)

# Prepare preprocessing
pipeline = preprocessing.Pipeline()
# Without batch_size there is a high chance of hitting a memory error
# or a pytables crash.
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(batch_size=5000))
pipeline.items.append(preprocessing.LeCunLCN((32, 32)))

# Apply the preprocessing to train
train.apply_preprocessor(pipeline, can_fit=True)
del train

# Load and preprocess valid
LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
print("LR_decay = " + str(LR_decay))
# BTW, LR decay might be good for the BN moving average...

# for SVHN, depending on available CPU memory
# 1, 2, 4, 7 or 14
shuffle_parts = 1
# shuffle_parts = 2  # does not work on bart5
# shuffle_parts = 4  # seems to work on bart5
# shuffle_parts = 7  # just to be safe
print("shuffle_parts = " + str(shuffle_parts))

print('Loading SVHN dataset')

# Only load the 73257 training examples, not the extra 531131 examples;
# this is done for computational reasons.
train_set = SVHN(which_set='train', axes=['b', 'c', 0, 1])

# We only test the train accuracy in this evaluation.
# test_set = SVHN(
#     which_set='train',
#     axes=['b', 'c', 0, 1])

print('Building the CNN...')

# Load the randomized dataset that was saved when the training was done.
train_set.X = np.load('X_values_SVHN.npy')
train_set.y = np.load('Y_values_SVHN.npy')

# Load the first 7000 samples
train_set.X = train_set.X[:7000, :, :, :]
train_set.y = train_set.y[:7000, :]
import os

from pylearn2.datasets.svhn import SVHN
from pylearn2.utils.string_utils import preprocess

assert 'PYLEARN2_DATA_PATH' in os.environ, "PYLEARN2_DATA_PATH not defined"

orig_path = preprocess('${PYLEARN2_DATA_PATH}/SVHN/format2/')

# Check if the MAT files have been downloaded
if not os.path.isdir(orig_path):
    raise IOError("You need to download the SVHN format2 dataset MAT files "
                  "before running this conversion script.")

# Create the directory in which to save the pytables files
local_path = orig_path
if not os.path.isdir(os.path.join(local_path, 'h5')):
    os.makedirs(os.path.join(local_path, 'h5'))

print("***************************************************************\n"
      "Please ignore the warning produced during this MAT -> Pytables\n"
      "conversion for the SVHN dataset. If you are creating the\n"
      "pytables for the first time then no files are modified/over-written,\n"
      "they are simply written for the first time.\n"
      "***************************************************************\n")

test = SVHN('test', path=local_path)
valid = SVHN('valid', path=local_path)
train = SVHN('splitted_train', path=local_path)
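# Hedged follow-up sketch (not part of the script): list whatever pytables
# files the conversion produced under <local_path>/h5. The exact file names
# are determined by pylearn2.datasets.svhn.SVHN and may differ between
# versions, so the directory is enumerated rather than hard-coded.
h5_dir = os.path.join(local_path, 'h5')
for fname in sorted(os.listdir(h5_dir)):
    print(os.path.join(h5_dir, fname))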
def main(method, LR_start):

    name = "svhn"
    print("dataset = " + str(name))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))
    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    activation = lasagne.nonlinearities.rectify

    # number of filters in the first convolutional layer
    K = 64
    print("K = " + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = laq.Conv2DLayer(
        l_in, num_filters=K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)
    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = laq.Conv2DLayer(
        l_nl1, num_filters=K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)
    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)

    # 256C3-256C3-P2
    l_cnn3 = laq.Conv2DLayer(
        l_nl2, num_filters=2 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)
    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = laq.Conv2DLayer(
        l_nl3, num_filters=2 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)
    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = laq.Conv2DLayer(
        l_nl4, num_filters=4 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)
    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = laq.Conv2DLayer(
        l_nl5, num_filters=4 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)
    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = laq.DenseLayer(
        l_nl6, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)
    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = laq.DenseLayer(
        l_nl7, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)
    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = laq.DenseLayer(
        l_nl8, nonlinearity=lasagne.nonlinearities.identity,
        num_units=10, method=method)
    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, quantized=True)
        W_grads = laq.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads, params=W,
                                 learning_rate=LR)
        updates = laq.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out, trainable=True,
                                               quantized=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update the ternary weights
        ternary_weights = laq.get_quantized_weights(loss, l_out)
        updates2 = OrderedDict()
        idx = 0
        tt_tag = lasagne.layers.get_all_params(l_out, tt=True)
        for tt_tag_temp in tt_tag:
            updates2[tt_tag_temp] = ternary_weights[idx]
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates2.items())

        ## update 2nd moment, can also be obtained from the adam optimizer
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = (acc_tag_temp * beta2 +
                                      W_grads[idx] * W_grads[idx] * (1 - beta2))
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates3.items())

    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss, params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(which_set='splitted_train',
                     # which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    X_train = train_set.X
    y_train = train_set.y
    X_val = valid_set.X
    y_val = valid_set.y
    X_test = test_set.X
    y_test = test_set.y

    # This function trains the model a full epoch (on the whole dataset)
    def train_epoch(X, y, LR):
        loss = 0
        batches = len(X) / batch_size
        # move shuffle here to save memory
        # k = 5
        # batches = int(batches/k)*k
        shuffled_range = range(len(X))
        np.random.shuffle(shuffled_range)

        for i in range(batches):
            tmp_ind = shuffled_range[i * batch_size:(i + 1) * batch_size]
            newloss = train_fn(X[tmp_ind], y[tmp_ind], LR)
            loss += newloss
        loss /= batches
        return loss

    # This function tests the model a full epoch (on the whole dataset)
    def val_epoch(X, y):
        err = 0
        loss = 0
        batches = len(X) / batch_size

        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size],
                                       y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss

        err = err / batches * 100
        loss /= batches
        return err, loss

    best_val_err = 100
    best_epoch = 1
    LR = LR_start

    # We iterate over epochs:
    for epoch in range(1, num_epochs + 1):

        start_time = time.time()

        train_loss = train_epoch(X_train, y_train, LR)
        val_err, val_loss = val_epoch(X_val, y_val)

        # test whether the validation error went down
        if val_err <= best_val_err:
            best_val_err = val_err
            best_epoch = epoch
            test_err, test_loss = val_epoch(X_test, y_test)

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch) + " of " + str(num_epochs) +
              " took " + str(epoch_duration) + "s")
        print(" LR: " + str(LR))
        print(" training loss: " + str(train_loss))
        print(" validation loss: " + str(val_loss))
        print(" validation error rate: " + str(val_err) + "%")
        print(" best epoch: " + str(best_epoch))
        print(" best validation error rate: " + str(best_val_err) + "%")
        print(" test loss: " + str(test_loss))
        print(" test error rate: " + str(test_err) + "%")

        with open("{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method),
                  "a") as myfile:
            myfile.write(
                "{0} {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n"
                .format(epoch, train_loss, val_loss, test_loss, val_err,
                        test_err, epoch_duration, LR))

        ## Learning rate update scheme
        if epoch == 15 or epoch == 25:
            LR *= LR_decay
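# Hedged usage sketch (the CLI wiring below is an assumption, not from the
# original source; the script name and method value are placeholders):
# e.g. python svhn_quantized.py <method> 0.01
if __name__ == "__main__":
    import sys
    main(method=sys.argv[1], LR_start=float(sys.argv[2]))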
W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper print("W_LR_scale = " + str(W_LR_scale)) # Decaying LR LR_start = 0.01 print("LR_start = " + str(LR_start)) LR_fin = 0.000003 print("LR_fin = " + str(LR_fin)) LR_decay = (LR_fin / LR_start)**(1. / num_epochs) print("LR_decay = " + str(LR_decay)) # BTW, LR decay might good for the BN moving average... print('Loading SVHN dataset') train_set = SVHN(which_set='splitted_train', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) valid_set = SVHN(which_set='valid', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) test_set = SVHN(which_set='test', path="${SVHN_LOCAL_PATH}", axes=['b', 'c', 0, 1]) # bc01 format # print train_set.X.shape train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32)) valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32)) test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))
def load_data(dataset, train_percent=0.8, val_percent=0.2):
    """Load the MNIST, CIFAR-10 or SVHN dataset.

    dataset: string: 'mnist', 'cifar10' or 'svhn'
    train_percent: float: fraction of the dataset used for training
    val_percent: float: fraction of the dataset used for validation

    Output: (train_x, val_x, test_x, train_y, val_y, test_y)
    """
    zero_mean = False
    if dataset.lower() == 'mnist':
        print('Loading MNIST dataset from pylearn2')
        train_set_size = int(_DATASET_SIZE['mnist'] * train_percent)
        train_data = MNIST(which_set='train', start=0, stop=train_set_size,
                           center=zero_mean)
        val_data = MNIST(which_set='train', start=train_set_size,
                         stop=_DATASET_SIZE[dataset], center=zero_mean)
        test_data = MNIST(which_set='test', center=zero_mean)

        # convert labels into a 1D array
        train_data.y = np.hstack(train_data.y)
        val_data.y = np.hstack(val_data.y)
        test_data.y = np.hstack(test_data.y)

        # create a 10-dimensional vector corresponding to each label
        train_data.y = np.float32(np.eye(10))[train_data.y]
        val_data.y = np.float32(np.eye(10))[val_data.y]
        test_data.y = np.float32(np.eye(10))[test_data.y]

        # reshape the data into image size (#images, channels, height, width).
        # Each row of the original dataset contains one image.
        train_data.X = np.reshape(train_data.X, (-1, 1, 28, 28))
        val_data.X = np.reshape(val_data.X, (-1, 1, 28, 28))
        test_data.X = np.reshape(test_data.X, (-1, 1, 28, 28))

        # convert to the [-1, 1] range
        train_data.X = train_data.X * 2.0 - 1.0
        val_data.X = val_data.X * 2.0 - 1.0
        test_data.X = test_data.X * 2.0 - 1.0

    elif dataset.lower() == 'cifar10':
        print('Loading CIFAR-10 dataset from pylearn2')
        train_set_size = int(_DATASET_SIZE['cifar10'] * train_percent)
        train_data = CIFAR10(which_set='train', start=0, stop=train_set_size)
        val_data = CIFAR10(which_set='train', start=train_set_size, stop=50000)
        test_data = CIFAR10(which_set='test')

        # convert labels into a 1D array
        train_data.y = np.hstack(train_data.y)
        val_data.y = np.hstack(val_data.y)
        test_data.y = np.hstack(test_data.y)

        # create a 10-dimensional vector corresponding to each label
        train_data.y = np.float32(np.eye(10))[train_data.y]
        val_data.y = np.float32(np.eye(10))[val_data.y]
        test_data.y = np.float32(np.eye(10))[test_data.y]

        # reshape the data into image size (#images, channels, height, width).
        # Each row of the original dataset contains one image.
        train_data.X = np.reshape(train_data.X, (-1, 3, 32, 32))
        val_data.X = np.reshape(val_data.X, (-1, 3, 32, 32))
        test_data.X = np.reshape(test_data.X, (-1, 3, 32, 32))

        # convert to the [-1, 1] range
        train_data.X = train_data.X * (2.0 / 255) - 1.0
        val_data.X = val_data.X * (2.0 / 255) - 1.0
        test_data.X = test_data.X * (2.0 / 255) - 1.0

    elif dataset.lower() == 'svhn':
        train_data = SVHN(which_set='splitted_train', axes=['b', 'c', 0, 1])
        val_data = SVHN(which_set='valid', axes=['b', 'c', 0, 1])
        test_data = SVHN(which_set='test', axes=['b', 'c', 0, 1])

        # convert labels into a 1D array
        train_data.y = np.hstack(train_data.y)
        val_data.y = np.hstack(val_data.y)
        test_data.y = np.hstack(test_data.y)

        # create a 10-dimensional vector corresponding to each label
        train_data.y = np.float32(np.eye(10))[train_data.y]
        val_data.y = np.float32(np.eye(10))[val_data.y]
        test_data.y = np.float32(np.eye(10))[test_data.y]

        # convert to the [-1, 1] range and reshape to bc01
        train_data.X = np.reshape(
            np.subtract(np.multiply(2.0 / 255, train_data.X), 1.0),
            (-1, 3, 32, 32))
        val_data.X = np.reshape(
            np.subtract(np.multiply(2.0 / 255, val_data.X), 1.0),
            (-1, 3, 32, 32))
        test_data.X = np.reshape(
            np.subtract(np.multiply(2.0 / 255, test_data.X), 1.0),
            (-1, 3, 32, 32))

    else:
        print('This dataset is not supported. Only MNIST, CIFAR-10 and SVHN '
              'are supported for now.')
        raise ValueError('Dataset is not supported')

    print('Trainset shape = ', train_data.X.shape, train_data.y.shape)
    print('Valset shape = ', val_data.X.shape, val_data.y.shape)
    print('Testset shape = ', test_data.X.shape, test_data.y.shape)

    return (train_data.X, val_data.X, test_data.X,
            train_data.y, val_data.y, test_data.y)
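# Hedged usage sketch for load_data (not from the source): assumes pylearn2 and
# the CIFAR-10 data are installed and that _DATASET_SIZE['cifar10'] == 50000,
# as implied by the validation split above.
train_x, val_x, test_x, train_y, val_y, test_y = load_data('cifar10')
print(train_x.shape, train_y.shape)  # expected: (40000, 3, 32, 32) (40000, 10)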
                               preprocessor=preprocessor,
                               start=45000,
                               stop=50000)

    test_set = ZCA_Dataset(
        preprocessed_dataset=serial.load(
            "${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"),
        preprocessor=preprocessor)

    # for both datasets, onehot the target
    train_set.y = np.float32(onehot(train_set.y))
    valid_set.y = np.float32(onehot(valid_set.y))
    test_set.y = np.float32(onehot(test_set.y))

elif dataset == "SVHN":

    train_set = SVHN(which_set='splitted_train',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

print 'Creating the model'

# storing format hyperparameters
format = sys.argv[2]
class HPS:
    def __init__(self,
                 state,
                 base_channel_names=['train_objective'],
                 save_prefix="model_",
                 cache_dataset=True):
        self.cache_dataset = cache_dataset
        self.dataset_cache = {}

        self.state = state
        self.mbsb_channel_name = \
            self.state.term_array.early_stopping.save_best_channel
        self.base_channel_names = base_channel_names
        self.save_prefix = save_prefix
        # TODO: store this in data for each experiment or dataset

    def run(self):
        (model, learner, algorithm) = self.get_config()
        # try:
        print 'learning'
        learner.main_loop()
        # except Exception, e:
        #     print e
        print 'End of model training'

    def get_config(self):
        # dataset
        self.load_dataset()

        # model
        self.load_model()

        # monitor
        self.setup_monitor()

        # training algorithm
        algorithm = self.get_train()

        # extensions
        extensions = self.get_extensions()

        # channels
        # self.setup_channels()

        # learner
        learner = Train(dataset=self.train_ddm,
                        model=self.model,
                        algorithm=algorithm,
                        extensions=extensions)

        return (self.model, learner, algorithm)

    def load_dataset(self):
        # TODO: we might need other variables for identifying what kind of
        # extra preprocessing was done, such as feature products and the
        # number of features kept based on MI.
        if self.state.dataset == 'mnist':
            self.test_ddm = MNIST(which_set='test', one_hot=True)
            dataset = MNIST(which_set='train', shuffle=True, one_hot=True)
            train_X, valid_X = np.split(dataset.X, [50000])
            train_y, valid_y = np.split(dataset.y, [50000])
            self.train_ddm = DenseDesignMatrix(X=train_X, y=train_y)
            self.valid_ddm = DenseDesignMatrix(X=valid_X, y=valid_y)
        elif self.state.dataset == 'svhn':
            self.train_ddm = SVHN(which_set='splitted_train')
            self.test_ddm = SVHN(which_set='test')
            self.valid_ddm = SVHN(which_set='valid')
        elif self.state.dataset == 'cifar10':
            self.train_ddm = My_CIFAR10(which_set='train', one_hot=True)
            self.test_ddm = None
            self.valid_ddm = My_CIFAR10(which_set='test', one_hot=True)

        if self.train_ddm is not None:
            self.nvis = self.train_ddm.X.shape[1]
            self.nout = self.train_ddm.y.shape[1]
            print "nvis, nout :", self.nvis, self.nout
            self.ntrain = self.train_ddm.X.shape[0]
            print "ntrain :", self.ntrain
        if self.valid_ddm is not None:
            self.nvalid = self.valid_ddm.X.shape[0]
            print "nvalid :", self.nvalid
        if self.test_ddm is not None:
            self.ntest = self.test_ddm.X.shape[0]
            print "ntest :", self.ntest

    def load_model(self):
        model_class = self.state.model_class
        fn = getattr(self, 'get_model_' + model_class)
        self.model = fn()
        return self.model

    def get_model_mlp(self):
        self.dropout = False
        self.input_include_probs = {}
        self.input_scales = {}
        self.weight_decay = False
        self.weight_decays = {}
        self.l1_weight_decay = False
        self.l1_weight_decays = {}

        nnet_layers = self.state.layers
        input_space_id = self.state.input_space_id
        nvis = self.nvis
        self.batch_size = self.state.batch_size
        # TODO: add input_space as a config option.
        input_space = None
        # TODO: topo_view always False for the moment.
        self.topo_view = False
        assert nvis is not None

        layers = []
        for i, layer in enumerate(nnet_layers.values()):
            layer = expand(layer)
            layer = self.get_layer(layer, i)
            layers.append(layer)
        # create MLP:
        print layers
        model = My_MLP(layers=layers, input_space=input_space, nvis=nvis,
                       batch_size=self.batch_size)
        self.mlp = model
        return model

    def get_layer(self, layer, layer_id):
        layer_class = layer.layer_class
        layer_name = layer.layer_name
        dropout_scale = layer.dropout_scale
        dropout_prob = layer.dropout_probability
        weight_decay = layer.weight_decay
        l1_weight_decay = layer.l1_weight_decay
        fn = getattr(self, 'get_layer_' + layer_class)
        if layer_name is None:
            layer_name = layer_class + str(layer_id)
            layer.layer_name = layer_name
        layer = fn(layer)
        # per-layer cost function parameters:
        if dropout_scale is not None:
            self.dropout = True
            self.input_scales[layer_name] = dropout_scale
        if dropout_prob is not None:
            self.dropout = True
            self.input_include_probs[layer_name] = (1. - dropout_prob)
        if weight_decay is not None:
            self.weight_decay = True
            self.weight_decays[layer_name] = weight_decay
        if l1_weight_decay is not None:
            self.l1_weight_decay = True
            self.l1_weight_decays[layer_name] = l1_weight_decay
        return layer

    def get_layer_sigmoid(self, layer):
        return Sigmoid(layer_name=layer.layer_name, dim=layer.dim,
                       irange=layer.irange, istdev=layer.istdev,
                       sparse_init=layer.sparse_init,
                       sparse_stdev=layer.sparse_stdev,
                       include_prob=layer.include_prob,
                       init_bias=layer.init_bias,
                       W_lr_scale=layer.W_lr_scale,
                       b_lr_scale=layer.b_lr_scale,
                       max_col_norm=layer.max_col_norm,
                       max_row_norm=layer.max_row_norm)

    def get_layer_tanh(self, layer):
        return My_Tanh(layer_name=layer.layer_name, dim=layer.dim,
                       irange=layer.irange, istdev=layer.istdev,
                       sparse_init=layer.sparse_init,
                       sparse_stdev=layer.sparse_stdev,
                       include_prob=layer.include_prob,
                       init_bias=layer.init_bias,
                       W_lr_scale=layer.W_lr_scale,
                       b_lr_scale=layer.b_lr_scale,
                       max_col_norm=layer.max_col_norm,
                       max_row_norm=layer.max_row_norm)

    def get_layer_rectifiedlinear(self, layer):
        # TODO: left_slope is set to 0.0; it should be set by the user!
        layer.left_slope = 0.0
        return RectifiedLinear(layer_name=layer.layer_name, dim=layer.dim,
                               irange=layer.irange, istdev=layer.istdev,
                               sparse_init=layer.sparse_init,
                               sparse_stdev=layer.sparse_stdev,
                               include_prob=layer.include_prob,
                               init_bias=layer.init_bias,
                               W_lr_scale=layer.W_lr_scale,
                               b_lr_scale=layer.b_lr_scale,
                               max_col_norm=layer.max_col_norm,
                               max_row_norm=layer.max_row_norm,
                               left_slope=layer.left_slope,
                               use_bias=layer.use_bias)

    def get_layer_softmax(self, layer):
        return My_Softmax(layer_name=layer.layer_name, n_classes=layer.dim,
                          irange=layer.irange, istdev=layer.istdev,
                          sparse_init=layer.sparse_init,
                          init_bias_target_marginals=layer.init_bias,
                          W_lr_scale=layer.W_lr_scale,
                          b_lr_scale=layer.b_lr_scale,
                          max_col_norm=layer.max_col_norm,
                          max_row_norm=layer.max_row_norm)

    def get_layer_noisyRELU(self, layer):
        return NoisyRELU(dim=layer.dim,
                         layer_name=layer.layer_name,
                         irange=layer.irange,
                         sparse_init=layer.sparse_init,
                         W_lr_scale=layer.W_lr_scale,
                         b_lr_scale=layer.b_lr_scale,
                         mask_weights=None,
                         max_row_norm=layer.max_row_norm,
                         max_col_norm=layer.max_col_norm,
                         use_bias=True,
                         noise_factor=layer.noise_factor,
                         desired_active_rate=layer.desired_active_rate,
                         adjust_threshold_factor=layer.adjust_threshold_factor)

    def get_layer_gaussianRELU(self, layer):
        return GaussianRELU(dim=layer.dim,
                            layer_name=layer.layer_name,
                            irange=layer.irange,
                            sparse_init=layer.sparse_init,
                            W_lr_scale=layer.W_lr_scale,
                            b_lr_scale=layer.b_lr_scale,
                            mask_weights=None,
                            max_row_norm=layer.max_row_norm,
                            max_col_norm=layer.max_col_norm,
                            use_bias=True,
                            desired_active_rate=layer.desired_active_rate,
                            adjust_threshold_factor=layer.adjust_threshold_factor,
                            noise_std=layer.noise_std)

    def setup_monitor(self):
        if self.topo_view:
            print "topo view"
            self.minibatch = T.as_tensor_variable(
                self.valid_ddm.get_batch_topo(self.batch_size),
                name='minibatch')
        else:
            print "design view"
            batch = self.valid_ddm.get_batch_design(self.batch_size)
            if isinstance(batch, spp.csr_matrix):
                print "sparse2"
                self.minibatch = \
                    self.model.get_input_space().make_batch_theano()
                print type(self.minibatch)
            else:
                self.minibatch = T.as_tensor_variable(
                    self.valid_ddm.get_batch_design(self.batch_size),
                    name='minibatch')

        self.target = T.matrix('target')

        self.monitor = Monitor.get_monitor(self.model)
        self.log_channel_names = []
        self.log_channel_names.extend(self.base_channel_names)

        # self.monitor.add_dataset(self.valid_ddm,
        #                          self.state.train_iteration_mode,
        #                          self.batch_size)
        # if self.test_ddm is not None:
        #     self.monitor.add_dataset(self.test_ddm,
        #                              self.state.train_iteration_mode,
        #                              self.batch_size)

    def get_train(self):
        train_class = self.state.train_class
        fn = getattr(self, 'get_train_' + train_class)
        return fn()

    def get_train_sgd(self):
        cost = MethodCost('cost_from_X')
        # cost = self.get_costs()
        num_train_batch = (self.ntrain / self.batch_size)
        print "num training batches:", num_train_batch

        termination_criterion = self.get_terminations()

        monitoring_dataset = {}
        for dataset_id in self.state.monitoring_dataset:
            if dataset_id == 'test' and self.test_ddm is not None:
                monitoring_dataset['test'] = self.test_ddm
            elif dataset_id == 'valid' and self.valid_ddm is not None:
                monitoring_dataset['valid'] = self.valid_ddm
            else:
                monitoring_dataset = None

        return SGD(learning_rate=self.state.learning_rate,
                   batch_size=self.state.batch_size,
                   cost=cost,
                   batches_per_iter=num_train_batch,
                   monitoring_dataset=monitoring_dataset,
                   termination_criterion=termination_criterion,
                   init_momentum=self.state.init_momentum,
                   train_iteration_mode=self.state.train_iteration_mode)

    def get_terminations(self):
        if 'term_array' not in self.state:
            return None
        terminations = []

        for term_obj in self.state.term_array.values():
            fn = getattr(self, 'get_term_' + term_obj.term_class)
            terminations.append(fn(term_obj))
        if len(terminations) > 1:
            return And(terminations)
        return terminations[0]

    def get_term_epochcounter(self, term_obj):
        return EpochCounter(term_obj.max_epochs)

    def get_term_monitorbased(self, term_obj):
        print 'monitor_based'
        return MonitorBased(
            prop_decrease=term_obj.proportional_decrease,
            N=term_obj.max_epochs,
            channel_name=term_obj.channel_name)

    def get_extensions(self):
        if 'ext_array' not in self.state:
            return []
        extensions = []

        for ext_obj in self.state.ext_array.values():
            fn = getattr(self, 'get_ext_' + ext_obj.ext_class)
            extensions.append(fn(ext_obj))

        # monitor based save best
        print 'save best channel', self.mbsb_channel_name
        if self.mbsb_channel_name is not None:
            self.save_path = self.save_prefix + str(self.state.config_id) \
                + "_optimum.pkl"
            extensions.append(MonitorBasedSaveBest(
                channel_name=self.mbsb_channel_name,
                save_path=self.save_path))

        return extensions

    def get_ext_exponentialdecayoverepoch(self, ext_obj):
        return ExponentialDecayOverEpoch(
            decay_factor=ext_obj.decay_factor,
            min_lr_scale=ext_obj.min_lr_scale)

    def get_ext_momentumadjustor(self, ext_obj):
        return MomentumAdjustor(
            final_momentum=ext_obj.final_momentum,
            start=ext_obj.start_epoch,
            saturate=ext_obj.saturate_epoch)
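# Hedged usage sketch (not from the source): HPS expects a jobman-style
# `state` object carrying the fields referenced above, e.g. state.dataset,
# state.model_class, state.layers, state.batch_size, state.learning_rate,
# state.init_momentum, state.term_array and state.ext_array.
# hps = HPS(state, save_prefix="model_")
# hps.run()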
    os.makedirs(os.path.join(local_path, 'h5'))

for d_set in [train_name, valid_name, test_name]:
    if not os.path.isfile(os.path.join(local_path, d_set)):
        logging.info("Copying data from {0} to {1}".format(
            os.path.join(orig_path, d_set), local_path))
        shutil.copyfile(os.path.join(orig_path, d_set),
                        os.path.join(local_path, d_set))


def check_dtype(data):
    if str(data.X.dtype) != config.floatX:
        logging.warning("The dataset is saved as {}, changing theano's floatX "
                        "to the same dtype".format(data.X.dtype))
        config.floatX = str(data.X.dtype)


# Load train data
train = SVHN('splitted_train', path=local_path)
check_dtype(train)

# Prepare preprocessing
pipeline = preprocessing.Pipeline()
# Without batch_size there is a high chance of hitting a memory error
# or a pytables crash.
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(batch_size=5000))
pipeline.items.append(preprocessing.LeCunLCN((32, 32)))

# Apply the preprocessing to train
train.apply_preprocessor(pipeline, can_fit=True)
del train

# Load and preprocess valid
valid = SVHN('valid', path=local_path)
LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
print("LR_decay = " + str(LR_decay))
# BTW, LR decay might be good for the BN moving average...

# for SVHN, depending on available CPU memory
# 1, 2, 4, 7 or 14
shuffle_parts = 1
# shuffle_parts = 2  # does not work on bart5
# shuffle_parts = 4  # seems to work on bart5
# shuffle_parts = 7  # just to be safe
print("shuffle_parts = " + str(shuffle_parts))

print('Loading SVHN dataset')

train_set = SVHN(which_set='splitted_train', axes=['b', 'c', 0, 1])
valid_set = SVHN(which_set='valid', axes=['b', 'c', 0, 1])
test_set = SVHN(which_set='test', axes=['b', 'c', 0, 1])

# bc01 format
# Inputs in the range [-1,+1]
# print("Inputs in the range [-1,+1]")
train_set.X = np.reshape(
    np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32))
valid_set.X = np.reshape(
    np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32))
def main(method, LR_start, Binarize_weight_only):

    name = "svhn"
    print("dataset = " + str(name))
    print("Binarize_weight_only = " + str(Binarize_weight_only))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))
    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    ## number of filters in the first convolutional layer
    K = 64
    print("K = " + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = lab.Conv2DLayer(
        l_in, num_filters=K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)
    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = lab.Conv2DLayer(
        l_nl1, num_filters=K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)
    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)

    # 256C3-256C3-P2
    l_cnn3 = lab.Conv2DLayer(
        l_nl2, num_filters=2 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)
    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = lab.Conv2DLayer(
        l_nl3, num_filters=2 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)
    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = lab.Conv2DLayer(
        l_nl4, num_filters=4 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)
    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = lab.Conv2DLayer(
        l_nl5, num_filters=4 * K, filter_size=(3, 3), pad=1,
        nonlinearity=lasagne.nonlinearities.identity, method=method)
    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)
    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = lab.DenseLayer(
        l_nl6, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)
    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = lab.DenseLayer(
        l_nl7, nonlinearity=lasagne.nonlinearities.identity,
        num_units=1024, method=method)
    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)
    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = lab.DenseLayer(
        l_nl8, nonlinearity=lasagne.nonlinearities.identity,
        num_units=10, method=method)
    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, binary=True)
        W_grads = lab.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads, params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out, trainable=True,
                                               binary=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update 2nd moment, can also be obtained from the adam optimizer
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = (acc_tag_temp * beta2 +
                                      W_grads[idx] * W_grads[idx] * (1 - beta2))
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates3.items())

    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss, params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(which_set='splitted_train',
                     # which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])
    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    # ipdb.set_trace()
    lab.train(name, method,
              train_fn, val_fn,
              batch_size,
              LR_start, LR_decay,
              num_epochs,
              train_set.X, train_set.y,
              valid_set.X, valid_set.y,
              test_set.X, test_set.y)
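# Hedged usage sketch (the CLI wiring below is an assumption, not from the
# original source; the script name and method value are placeholders, while
# "w" is the Binarize_weight_only value checked in main above):
# e.g. python svhn_binary.py <method> 0.001 w
if __name__ == "__main__":
    import sys
    main(method=sys.argv[1],
         LR_start=float(sys.argv[2]),
         Binarize_weight_only=sys.argv[3])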