def setUp(self):
    """Run skip_if_no_data(), then load the TFD configurations under test.

    The 'train' and 'test' sets are stored on the instance for use by the
    test methods. Every other configuration is only constructed, so that
    a failure to load it surfaces here.
    """
    skip_if_no_data()
    self.train = TFD(which_set='train')
    self.test = TFD(which_set='test')

    # Constructed purely to exercise loading; the objects are discarded.
    load_only = [
        dict(which_set='valid'),
        dict(which_set='unlabeled'),
        dict(which_set='full_train'),
        dict(which_set='test', image_size=96),
    ]
    load_only.extend(dict(which_set='test', fold=k) for k in (1, 2, 3, 4))
    for kwargs in load_only:
        TFD(**kwargs)
def __init__(self, start=None, stop=None, shuffle=False, rng=None,
             seed=132987, center=False, scale=False,
             axes=('b', 0, 1, 'c'), preprocessor=None, which_ds='kaggle'):
    """Build the dataset from ``load_data`` plus the TFD train/valid sets.

    The examples returned by ``self.load_data`` are concatenated with the
    TFD 'train' and 'valid' sets (in that order), optionally shuffled,
    optionally restricted to ``[start:stop]``, and handed to the parent
    constructor with a 48x48x1 view converter.

    Parameters
    ----------
    start : int or None
        First example to keep. ``None`` means 0 when `stop` is given.
        Must be non-negative.
    stop : int or None
        One past the last example to keep. ``None`` means "up to and
        including the last example". An explicit ``-1`` is passed to the
        slice unchanged (i.e. it excludes the last example).
    shuffle : bool
        If true, apply one random permutation to examples and labels
        before slicing.
    rng : object or None
        Object providing ``permutation(n)``; when ``None`` a
        ``np.random.RandomState(seed)`` is created.
    seed : int
        Seed used only when `rng` is ``None``.
    center : bool
        Forwarded to ``load_data``; when true, 0.5 is also subtracted from
        the combined data.
    scale : bool
        Forwarded to ``load_data`` and to the TFD constructor.
    axes : tuple
        Axis order given to the view converter and stored as
        ``self.axes``.
    preprocessor : object or None
        If given, ``preprocessor.apply(self)`` is called last.
    which_ds : str
        Forwarded to ``load_data`` as ``which``.

    Raises
    ------
    AssertionError
        If `start` is negative, if `stop` is not greater than `start`, or
        if the resulting design matrix contains NaN.
    """
    data_x, data_y = self.load_data(which=which_ds, center=center,
                                    scale=scale)

    # Append the TFD 'train' and 'valid' sets, in that order.
    for which_set in ('train', 'valid'):
        tfd = TFD(which_set, one_hot=1, scale=scale)
        data_x = np.concatenate((data_x, tfd.X))
        data_y = np.concatenate((data_y, tfd.y))

    if shuffle:
        if rng is None:
            rng = np.random.RandomState(seed)
        rand_idx = rng.permutation(len(data_x))
        data_x = data_x[rand_idx]
        data_y = data_y[rand_idx]

    if start is not None or stop is not None:
        if start is None:
            start = 0
        else:
            assert start >= 0
        # A missing `stop` used to be replaced by -1, which made the slice
        # below drop the final example. Leaving it as None slices to the
        # end; an explicit -1 from the caller keeps its previous meaning.
        if stop is not None and stop != -1:
            assert stop > start
        data_x = data_x[start:stop]
        data_y = data_y[start:stop]

    if center:
        # NOTE(review): `center` is also forwarded to load_data above;
        # confirm the two centring steps are meant to combine.
        data_x -= 0.5

    self.axes = axes
    view_converter = dense_design_matrix.DefaultViewConverter((48, 48, 1),
                                                              axes)
    super(GoogleTFDDataset, self).__init__(X=data_x, y=data_y,
                                           view_converter=view_converter)
    assert not np.any(np.isnan(self.X))

    if preprocessor is not None:
        preprocessor.apply(self)
def test_iterator(self):
    # An iterator given topological data_specs must return the same batch
    # as get_topological_view called on a dataset stored with the
    # matching axis order.
    test = TFD(which_set='test')
    batch_size = 100
    b01c_axes = ('b', 0, 1, 'c')
    c01b_axes = ('c', 0, 1, 'b')

    def first_batch(dataset, axes):
        # First sequential batch of `dataset`, requested in `axes` order.
        space = Conv2DSpace(shape=(48, 48), num_channels=1, axes=axes)
        batches = dataset.iterator(mode='sequential',
                                   batch_size=batch_size,
                                   data_specs=(space, 'features'))
        return batches.next()

    # Dataset stored as b01c, batch requested as b01c.
    b01c_topo = test.get_topological_view(test.X[0:batch_size, :])
    b01c_b01c = first_batch(test, b01c_axes)
    assert np.all(b01c_topo == b01c_b01c)

    # Dataset stored as c01b, batch requested as c01b.
    c01b_test = TFD(which_set='test', axes=c01b_axes)
    c01b_topo = c01b_test.get_topological_view(
        c01b_test.X[0:batch_size, :])
    c01b_c01b = first_batch(c01b_test, c01b_axes)
    assert np.all(c01b_topo == c01b_c01b)

    # Batches requested with the same Conv2DSpace data_specs must not
    # depend on the axes the dataset itself was built with.
    b01c_c01b = first_batch(test, c01b_axes)
    assert np.all(b01c_c01b == c01b_c01b)

    c01b_b01c = first_batch(c01b_test, b01c_axes)
    assert np.all(c01b_b01c == b01c_b01c)
def test_iterator(self):
    # An iterator given topological data_specs must return the same batch
    # as get_topological_view called on a dataset stored with the
    # matching axis order.
    batch_size = 100
    b01c_axes = ('b', 0, 1, 'c')
    c01b_axes = ('c', 0, 1, 'b')

    def first_batch(dataset, axes):
        # First sequential batch of `dataset`, requested in `axes` order.
        space = Conv2DSpace(shape=(48, 48), num_channels=1, axes=axes)
        batches = dataset.iterator(mode='sequential',
                                   batch_size=batch_size,
                                   data_specs=(space, 'features'))
        return batches.next()

    # Dataset stored as b01c (self.test), batch requested as b01c.
    b01c_topo = self.test.get_topological_view(
        self.test.X[0:batch_size, :])
    b01c_b01c = first_batch(self.test, b01c_axes)
    assert np.all(b01c_topo == b01c_b01c)

    # Dataset stored as c01b, batch requested as c01b.
    c01b_test = TFD(which_set='test', axes=c01b_axes)
    c01b_topo = c01b_test.get_topological_view(
        c01b_test.X[0:batch_size, :])
    c01b_c01b = first_batch(c01b_test, c01b_axes)
    assert np.all(c01b_topo == c01b_c01b)

    # Batches requested with the same Conv2DSpace data_specs must not
    # depend on the axes the dataset itself was built with.
    b01c_c01b = first_batch(self.test, c01b_axes)
    assert np.all(b01c_c01b == c01b_c01b)

    c01b_b01c = first_batch(c01b_test, b01c_axes)
    assert np.all(c01b_b01c == b01c_b01c)
def test_load(self):
    """Construct TFD in each supported configuration; passes if none raise."""
    configurations = [
        dict(which_set='valid'),
        dict(which_set='unlabeled'),
        dict(which_set='full_train'),
        dict(which_set='test', image_size=96),
    ]
    configurations.extend(
        dict(which_set='test', fold=k) for k in (1, 2, 3, 4))
    for kwargs in configurations:
        TFD(**kwargs)
def get_valid(ds, limit_size=-1, fold=0):
    """Return the validation design matrix for the dataset named `ds`.

    Parameters
    ----------
    ds : str
        ``'mnist'`` (examples 50000..60000 of the MNIST 'train' set) or
        ``'tfd'`` (the TFD 'valid' set, loaded with ``scale=True``).
    limit_size : int
        MNIST only: maximum number of rows to return. The default ``-1``
        means "no limit". Any other value is used as the slice stop, as
        before. Ignored for ``'tfd'``.
    fold : int
        TFD only: fold passed to the ``TFD`` constructor.

    Returns
    -------
    The ``X`` attribute of the loaded dataset (sliced for MNIST).

    Raises
    ------
    ValueError
        If `ds` is not a recognised dataset name.
    """
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        # -1 is the "no limit" sentinel; using it directly as a slice stop
        # would silently drop the last validation example.
        if limit_size == -1:
            return data.X
        return data.X[:limit_size]

    if ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X

    # Report the name that was actually passed in. The previous message
    # read an undefined `args.dataet`, so this path could never raise the
    # intended ValueError.
    raise ValueError("Unknown dataset: {}".format(ds))
def test_topo_c01b(self):
    """
    Checks that a topological batch laid out as ('c', 0, 1, 'b') equals
    the standard ('b', 0, 1, 'c') batch once its axes are transposed back.
    """
    batch_size = 100

    # Batch taken from a dataset built with the c01b axis order.
    c01b_test = TFD(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b = c01b_test.get_topological_view(c01b_X)
    assert c01b.shape == (1, 48, 48, batch_size)
    b01c_from_c01b = c01b.transpose((3, 1, 2, 0))

    # The design matrices themselves must be identical for both layouts.
    b01c_X = self.test.X[0:batch_size, :]
    assert c01b_X.shape == b01c_X.shape
    assert np.all(c01b_X == b01c_X)

    # The directly obtained b01c view must match the transposed c01b view.
    b01c_direct = self.test.get_topological_view(b01c_X)
    assert b01c_direct.shape == b01c_from_c01b.shape
    assert np.all(b01c_direct == b01c_from_c01b)
def get_valid(ds, limit_size=-1, fold=0):
    """Return the validation design matrix for the dataset named `ds`.

    Parameters
    ----------
    ds : str
        ``'mnist'`` (examples 50000..60000 of the MNIST 'train' set),
        ``'tfd'`` (the TFD 'valid' set, loaded with ``scale=True``) or
        ``'lfwcrop'`` (an LFW dataset built from hard-coded paths).
    limit_size : int
        MNIST only: maximum number of rows to return. The default ``-1``
        means "no limit". Any other value is used as the slice stop, as
        before. Ignored for the other datasets.
    fold : int
        TFD only: fold passed to the ``TFD`` constructor.

    Returns
    -------
    The ``X`` attribute of the loaded dataset (sliced for MNIST).

    Raises
    ------
    ValueError
        If `ds` is not a recognised dataset name.
    """
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        # -1 is the "no limit" sentinel; using it directly as a slice stop
        # would silently drop the last validation example.
        if limit_size == -1:
            return data.X
        return data.X[:limit_size]

    if ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X

    if ds == 'lfwcrop':
        # HACK: site-specific absolute paths are hard-coded here.
        lfw = LFW(
            axes=('c', 0, 1, 'b'),
            gcn=55,
            lfw_path=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfwcrop_color/faces32',
            filelist_path=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfwcrop_color/filelist.dev.ids.txt',
            embedding_file=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfw-lsa/LFW_attributes_30d.npz',
            img_shape=(3, 32, 32))
        return lfw.X

    # Report the name that was actually passed in. The previous message
    # read an undefined `args.dataet`, so this path could never raise the
    # intended ValueError.
    raise ValueError("Unknown dataset: {}".format(ds))
def test_topo(self):
    """Checks that get_batch_topo on the TFD 'train' set is 4-dimensional"""
    batch = TFD(which_set='train').get_batch_topo(1)
    assert batch.ndim == 4
pipeline.items.append(preprocessing.RemoveMean(axis=0)) pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(14,14), num_patches=5*1000*1000)) #### Build full-sized image dataset. #### print "Preparing output directory for unlabeled patches..." outdir = data_dir + '/tfd_lcn_v1' serial.mkdir(outdir) README = open('README','w') README.write(""" File generated from hossrbm/scripts/tfd/make_tfd_lcn.py. """) README.close() print 'Loading TFD unlabeled dataset...' print "Preprocessing the data..." data = TFD('unlabeled') data.apply_preprocessor(preprocessor = pipeline, can_fit = True) data.use_design_loc(outdir + '/unlabeled_patches.npy') serial.save(outdir + '/unlabeled_patches.pkl',data) #### For supervised dataset, we work on the full-image dataset #### pipeline.items.pop() #### Build supervised-training datasets #### print "Preparing output directory for supervised data..." for fold_i in xrange(0,5): path = '%s/fold%i' % (outdir, fold_i) serial.mkdir(path) train_data = TFD('train', fold=fold_i, center=False, shuffle=True, seed=37192)
class TestTFD(unittest.TestCase):
    """Tests for TFD: loading, topological views and iterators."""

    def setUp(self):
        """Run skip_if_no_data(), then load the TFD configurations.

        'train' and 'test' are kept on the instance; every other
        configuration is only constructed to check that it loads.
        """
        skip_if_no_data()
        self.train = TFD(which_set='train')
        self.test = TFD(which_set='test')

        # Constructed purely to exercise loading; the objects are
        # discarded.
        load_only = [
            dict(which_set='valid'),
            dict(which_set='unlabeled'),
            dict(which_set='full_train'),
            dict(which_set='test', image_size=96),
        ]
        load_only.extend(
            dict(which_set='test', fold=k) for k in (1, 2, 3, 4))
        for kwargs in load_only:
            TFD(**kwargs)

    def test_topo(self):
        """Checks that a topological batch has 4 dimensions"""
        batch = self.train.get_batch_topo(1)
        assert batch.ndim == 4

    def test_topo_c01b(self):
        """
        Checks that a topological batch laid out as ('c', 0, 1, 'b')
        equals the standard ('b', 0, 1, 'c') batch once its axes are
        transposed back.
        """
        batch_size = 100

        # Batch taken from a dataset built with the c01b axis order.
        c01b_test = TFD(which_set='test', axes=('c', 0, 1, 'b'))
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b = c01b_test.get_topological_view(c01b_X)
        assert c01b.shape == (1, 48, 48, batch_size)
        b01c_from_c01b = c01b.transpose((3, 1, 2, 0))

        # The design matrices must be identical for both layouts.
        b01c_X = self.test.X[0:batch_size, :]
        assert c01b_X.shape == b01c_X.shape
        assert np.all(c01b_X == b01c_X)

        # The direct b01c view must match the transposed c01b view.
        b01c_direct = self.test.get_topological_view(b01c_X)
        assert b01c_direct.shape == b01c_from_c01b.shape
        assert np.all(b01c_direct == b01c_from_c01b)

    def test_iterator(self):
        # An iterator given topological data_specs must return the same
        # batch as get_topological_view called on a dataset stored with
        # the matching axis order.
        batch_size = 100
        b01c_axes = ('b', 0, 1, 'c')
        c01b_axes = ('c', 0, 1, 'b')

        def first_batch(dataset, axes):
            # First sequential batch of `dataset`, requested in `axes`
            # order.
            space = Conv2DSpace(shape=(48, 48), num_channels=1, axes=axes)
            batches = dataset.iterator(mode='sequential',
                                       batch_size=batch_size,
                                       data_specs=(space, 'features'))
            return batches.next()

        # Dataset stored as b01c (self.test), batch requested as b01c.
        b01c_topo = self.test.get_topological_view(
            self.test.X[0:batch_size, :])
        b01c_b01c = first_batch(self.test, b01c_axes)
        assert np.all(b01c_topo == b01c_b01c)

        # Dataset stored as c01b, batch requested as c01b.
        c01b_test = TFD(which_set='test', axes=c01b_axes)
        c01b_topo = c01b_test.get_topological_view(
            c01b_test.X[0:batch_size, :])
        c01b_c01b = first_batch(c01b_test, c01b_axes)
        assert np.all(c01b_topo == c01b_c01b)

        # Batches requested with the same Conv2DSpace data_specs must not
        # depend on the axes the dataset itself was built with.
        b01c_c01b = first_batch(self.test, c01b_axes)
        assert np.all(b01c_c01b == c01b_c01b)

        c01b_b01c = first_batch(c01b_test, b01c_axes)
        assert np.all(c01b_b01c == b01c_b01c)
from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.tfd import TFD

# Fit a two-step pipeline (global contrast normalization, then ZCA) on the
# TFD 'train' set and save the fitted pipeline to 'tfd_gcn_whitener.pkl'.
train = TFD(which_set='train')

preprocessor = preprocessing.Pipeline()
preprocessor.items.extend([
    preprocessing.GlobalContrastNormalization(),
    preprocessing.ZCA(),
])

# can_fit=True is passed so the steps may be fitted on this data.
preprocessor.apply(train, can_fit=True)

serial.save('tfd_gcn_whitener.pkl', preprocessor)
from dataset import Dataset import theano import theano.tensor as T import numpy from pylearn2.datasets.preprocessing import Standardize, LeCunLCN, GlobalContrastNormalization from pylearn2.datasets.tfd import TFD import pickle as pkl theano.subtensor_merge_bug = False if __name__ == "__main__": weights_file = "../out/pae_mnist_enc_weights.npy" input = T.matrix("X", dtype=theano.config.floatX) tfd_ds = TFD("unlabeled") print(("TFD shape: ", tfd_ds.X.shape)) gcn = GlobalContrastNormalization() standardizer = Standardize() lcn = LeCunLCN(img_shape=(48, 48), channels=[0]) gcn.apply(tfd_ds, can_fit=True) standardizer.apply(tfd_ds, can_fit=True) lcn.apply(tfd_ds) rnd = numpy.random.RandomState(1231) powerup = PowerupAutoencoder(input, nvis=48 * 48, nhid=500, momentum=0.66,