Exemplo n.º 1
0
 def setUp(self):
     """Prepare the fixtures and check that every TFD variant can be built.

     ``skip_if_no_data()`` is called first (presumably it skips the test
     when the dataset files are missing -- confirm against its definition).
     Only the ``train`` and ``test`` splits are stored on the instance; the
     other variants are constructed purely as load checks.
     """
     skip_if_no_data()
     self.train = TFD(which_set='train')
     self.test = TFD(which_set='test')
     # Load checks only: the results are deliberately not bound to names, so
     # each dataset can be released before the next one is constructed.
     TFD(which_set='valid')
     TFD(which_set='unlabeled')
     TFD(which_set='full_train')
     TFD(which_set='test', image_size=96)
     TFD(which_set='test', fold=1)
     TFD(which_set='test', fold=2)
     TFD(which_set='test', fold=3)
     TFD(which_set='test', fold=4)
Exemplo n.º 2
0
    def __init__(self,
                 start=None,
                 stop=None,
                 shuffle=False,
                 rng=None,
                 seed=132987,
                 center=False,
                 scale=False,
                 axes=('b', 0, 1, 'c'),
                 preprocessor=None,
                 which_ds='kaggle'):
        """Build the dataset from ``load_data`` plus the TFD train/valid splits.

        Parameters
        ----------
        start : int or None
            First row to keep (applied after the optional shuffle).  ``None``
            means 0 when ``stop`` is given.
        stop : int or None
            Exclusive end row, with ordinary slice semantics.  ``None`` keeps
            everything through the last row.
        shuffle : bool
            When true, the rows of X and y are permuted with the same
            permutation.
        rng : object or None
            Object providing ``permutation``; when falsy a
            ``np.random.RandomState(seed)`` is created instead.
        seed : int
            Seed used only when ``rng`` is not supplied.
        center : bool
            Forwarded to ``load_data``; when true, 0.5 is additionally
            subtracted from X here.
        scale : bool
            Forwarded to ``load_data`` and to the ``TFD`` constructor.
        axes : tuple
            Axis order handed to ``DefaultViewConverter`` for the
            (48, 48, 1) view.
        preprocessor : object or None
            When given, ``preprocessor.apply(self)`` is called last.
        which_ds : str
            Forwarded to ``load_data`` as ``which``.
        """
        data_x, data_y = self.load_data(which=which_ds,
                                        center=center,
                                        scale=scale)

        # Append the TFD 'train' and then 'valid' examples to the loaded data.
        for split in ('train', 'valid'):
            tfd = TFD(split, one_hot=1, scale=scale)
            data_x = np.concatenate((data_x, tfd.X))
            data_y = np.concatenate((data_y, tfd.y))

        if shuffle:
            rng = rng if rng else np.random.RandomState(seed)
            rand_idx = rng.permutation(len(data_x))
            data_x = data_x[rand_idx]
            data_y = data_y[rand_idx]

        if start is not None or stop is not None:
            if start is None:
                start = 0
            else:
                assert start >= 0
            # A missing ``stop`` must mean "through the last row".  It used
            # to be replaced by -1, which made the slice ``[start:-1]`` and
            # silently dropped the final example.  An explicit -1 keeps its
            # usual slice meaning and stays exempt from the ordering check.
            if stop is not None and stop != -1:
                assert stop > start
            data_x = data_x[start:stop]
            data_y = data_y[start:stop]

        if center:
            data_x -= 0.5

        self.axes = axes
        view_converter = dense_design_matrix.DefaultViewConverter((48, 48, 1),
                                                                  axes)
        super(GoogleTFDDataset, self).__init__(X=data_x,
                                               y=data_y,
                                               view_converter=view_converter)
        assert not np.any(np.isnan(self.X))

        if preprocessor is not None:
            preprocessor.apply(self)
Exemplo n.º 3
0
    def test_iterator(self):
        """Check iterator batches against ``get_topological_view``.

        The first sequential batch requested with topological data_specs
        must equal the topological view of the same rows, for both axis
        orders, and must not depend on the axis order of the dataset itself.
        """
        n_rows = 100
        b01c_axes = ('b', 0, 1, 'c')
        c01b_axes = ('c', 0, 1, 'b')

        def first_batch(dataset, axes):
            # First sequential batch of `dataset`, delivered in `axes` order.
            space = Conv2DSpace(shape=(48, 48),
                                num_channels=1,
                                axes=axes)
            batches = dataset.iterator(mode='sequential',
                                       batch_size=n_rows,
                                       data_specs=(space, 'features'))
            return batches.next()

        # Dataset in its default axis order.
        b01c_ds = TFD(which_set='test')
        expected_b01c = b01c_ds.get_topological_view(b01c_ds.X[0:n_rows, :])
        native_b01c = first_batch(b01c_ds, b01c_axes)
        assert np.all(expected_b01c == native_b01c)

        # Dataset built with ('c', 0, 1, 'b') axes.
        c01b_ds = TFD(which_set='test', axes=c01b_axes)
        expected_c01b = c01b_ds.get_topological_view(c01b_ds.X[0:n_rows, :])
        native_c01b = first_batch(c01b_ds, c01b_axes)
        assert np.all(expected_c01b == native_c01b)

        # Same data_specs, different dataset axes: batches must agree.
        crossed_c01b = first_batch(b01c_ds, c01b_axes)
        assert np.all(crossed_c01b == native_c01b)

        crossed_b01c = first_batch(c01b_ds, b01c_axes)
        assert np.all(crossed_b01c == native_b01c)
Exemplo n.º 4
0
    def test_iterator(self):
        """Iterator batches must match ``get_topological_view`` output.

        Covers both axis orders and checks that the batch content depends
        only on the requested data_specs, not on the dataset's own axes.
        """
        rows = 100

        def specs(axes):
            # data_specs requesting 48x48 single-channel features in `axes`.
            return (Conv2DSpace(shape=(48, 48), num_channels=1, axes=axes),
                    'features')

        default_ds = self.test
        default_topo = default_ds.get_topological_view(
            default_ds.X[0:rows, :])
        default_native = default_ds.iterator(
            mode='sequential',
            batch_size=rows,
            data_specs=specs(('b', 0, 1, 'c'))).next()
        assert np.all(default_topo == default_native)

        flipped_ds = TFD(which_set='test', axes=('c', 0, 1, 'b'))
        flipped_topo = flipped_ds.get_topological_view(
            flipped_ds.X[0:rows, :])
        flipped_native = flipped_ds.iterator(
            mode='sequential',
            batch_size=rows,
            data_specs=specs(('c', 0, 1, 'b'))).next()
        assert np.all(flipped_topo == flipped_native)

        # Requesting the other axis order from each dataset must give the
        # same batch as the dataset that stores that order natively.
        default_as_flipped = default_ds.iterator(
            mode='sequential',
            batch_size=rows,
            data_specs=specs(('c', 0, 1, 'b'))).next()
        assert np.all(default_as_flipped == flipped_native)

        flipped_as_default = flipped_ds.iterator(
            mode='sequential',
            batch_size=rows,
            data_specs=specs(('b', 0, 1, 'c'))).next()
        assert np.all(flipped_as_default == default_native)
Exemplo n.º 5
0
 def test_load(self):
     """Construct each listed TFD variant; the test passes if none raises."""
     variants = (
         dict(which_set='valid'),
         dict(which_set='unlabeled'),
         dict(which_set='full_train'),
         dict(which_set='test', image_size=96),
         dict(which_set='test', fold=1),
         dict(which_set='test', fold=2),
         dict(which_set='test', fold=3),
         dict(which_set='test', fold=4),
     )
     for kwargs in variants:
         TFD(**kwargs)
Exemplo n.º 6
0
def get_valid(ds, limit_size=-1, fold=0):
    """Return the design matrix ``X`` of the validation data for ``ds``.

    Parameters
    ----------
    ds : str
        Dataset name: ``'mnist'`` or ``'tfd'``.
    limit_size : int
        Upper bound used to slice the MNIST matrix (``X[:limit_size]``).
        It is not applied to TFD.
    fold : int
        Fold passed to the ``TFD`` constructor; unused for MNIST.

    Raises
    ------
    ValueError
        If ``ds`` is not one of the supported names.
    """
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        # NOTE(review): with the default limit_size=-1 this is X[:-1], which
        # leaves out the last row -- confirm whether -1 was meant as
        # "no limit".
        return data.X[:limit_size]
    if ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X
    # Report the name that was actually passed in.  The message used to be
    # built from ``args.dataet``, which is not defined in this function, so
    # this path failed before the ValueError could be raised.
    raise ValueError("Unknow dataset: {}".format(ds))
Exemplo n.º 7
0
 def test_topo_c01b(self):
     """
     A topological batch produced with axes ('c', 0, 1, 'b') must equal
     the default-axes batch once its first and last axes are swapped.
     """
     n_rows = 100
     c01b_ds = TFD(which_set='test', axes=('c', 0, 1, 'b'))
     design_c01b = c01b_ds.X[0:n_rows, :]
     topo_c01b = c01b_ds.get_topological_view(design_c01b)
     assert topo_c01b.shape == (1, 48, 48, n_rows)
     # Move the last axis to the front and the first axis to the back.
     topo_swapped = topo_c01b.transpose(3, 1, 2, 0)

     # The design matrices themselves must be identical.
     design_b01c = self.test.X[0:n_rows, :]
     assert design_c01b.shape == design_b01c.shape
     assert np.all(design_c01b == design_b01c)

     topo_b01c = self.test.get_topological_view(design_b01c)
     assert topo_b01c.shape == topo_swapped.shape
     assert np.all(topo_b01c == topo_swapped)
Exemplo n.º 8
0
 def test_topo_c01b(self):
     """
     Checks that the ('c', 0, 1, 'b') topological view, transposed with
     (3, 1, 2, 0), matches the view produced by the default-axes dataset.
     """
     count = 100
     alt = TFD(which_set='test', axes=('c', 0, 1, 'b'))
     alt_rows = alt.X[0:count, :]
     alt_view = alt.get_topological_view(alt_rows)
     assert alt_view.shape == (1, 48, 48, count)
     alt_view_reordered = alt_view.transpose(3, 1, 2, 0)

     ref = self.test
     ref_rows = ref.X[0:count, :]
     # Same rows regardless of the axes the dataset was built with.
     assert alt_rows.shape == ref_rows.shape
     assert np.all(alt_rows == ref_rows)

     ref_view = ref.get_topological_view(ref_rows)
     assert ref_view.shape == alt_view_reordered.shape
     assert np.all(ref_view == alt_view_reordered)
Exemplo n.º 9
0
def get_valid(ds, limit_size=-1, fold=0):
    """Return the design matrix ``X`` of the validation data for ``ds``.

    Parameters
    ----------
    ds : str
        Dataset name: ``'mnist'``, ``'tfd'`` or ``'lfwcrop'``.
    limit_size : int
        Upper bound used to slice the MNIST matrix (``X[:limit_size]``).
        It is not applied to the other datasets.
    fold : int
        Fold passed to the ``TFD`` constructor; unused otherwise.

    Raises
    ------
    ValueError
        If ``ds`` is not one of the supported names.
    """
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        # NOTE(review): with the default limit_size=-1 this is X[:-1], which
        # leaves out the last row -- confirm whether -1 was meant as
        # "no limit".
        return data.X[:limit_size]
    if ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X
    if ds == 'lfwcrop':
        # HACK: paths are hard-coded to one user's AFS directory.
        data = LFW(
            axes=('c', 0, 1, 'b'),
            gcn=55,
            lfw_path=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfwcrop_color/faces32',
            filelist_path=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfwcrop_color/filelist.dev.ids.txt',
            embedding_file=
            '/afs/cs.stanford.edu/u/jgauthie/scr/lfw-lsa/LFW_attributes_30d.npz',
            img_shape=(3, 32, 32))
        return data.X
    # Report the name that was actually passed in.  The message used to be
    # built from ``args.dataet``, which is not defined in this function, so
    # this path failed before the ValueError could be raised.
    raise ValueError("Unknow dataset: {}".format(ds))
Exemplo n.º 10
0
 def test_topo(self):
     """A one-example topological batch from the train split is 4-D."""
     batch = TFD(which_set='train').get_batch_topo(1)
     n_dims = batch.ndim
     assert n_dims == 4
Exemplo n.º 11
0
pipeline.items.append(preprocessing.RemoveMean(axis=0))
pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(14,14), num_patches=5*1000*1000))

#### Build full-sized image dataset. ####
print "Preparing output directory for unlabeled patches..."
outdir = data_dir + '/tfd_lcn_v1'
serial.mkdir(outdir)
README = open('README','w')
README.write("""
File generated from hossrbm/scripts/tfd/make_tfd_lcn.py.
""")
README.close()

print 'Loading TFD unlabeled dataset...'
print "Preprocessing the data..."
data = TFD('unlabeled')
data.apply_preprocessor(preprocessor = pipeline, can_fit = True)
data.use_design_loc(outdir + '/unlabeled_patches.npy')
serial.save(outdir + '/unlabeled_patches.pkl',data)

#### For supervised dataset, we work on the full-image dataset ####
pipeline.items.pop()

#### Build supervised-training datasets ####
print "Preparing output directory for supervised data..."
for fold_i in xrange(0,5):

    path = '%s/fold%i' % (outdir, fold_i)
    serial.mkdir(path)

    train_data = TFD('train', fold=fold_i, center=False, shuffle=True, seed=37192)
Exemplo n.º 12
0
class TestTFD(unittest.TestCase):
    """Tests for loading TFD and for its topological views and iterators."""

    def setUp(self):
        """Prepare the fixtures and check that every TFD variant can be built.

        ``skip_if_no_data()`` is called first (presumably it skips the test
        when the dataset files are missing -- confirm against its
        definition).  Only the ``train`` and ``test`` splits are kept; the
        other variants are constructed purely as load checks.
        """
        skip_if_no_data()
        self.train = TFD(which_set='train')
        self.test = TFD(which_set='test')
        # Load checks only: results are deliberately not bound to names, so
        # each dataset can be released before the next one is constructed.
        TFD(which_set='valid')
        TFD(which_set='unlabeled')
        TFD(which_set='full_train')
        TFD(which_set='test', image_size=96)
        TFD(which_set='test', fold=1)
        TFD(which_set='test', fold=2)
        TFD(which_set='test', fold=3)
        TFD(which_set='test', fold=4)

    def test_topo(self):
        """Tests that a topological batch has 4 dimensions"""
        topo = self.train.get_batch_topo(1)
        assert topo.ndim == 4

    def test_topo_c01b(self):
        """
        Tests that a topological batch with axes ('c',0,1,'b')
        can be dimshuffled back to match the standard ('b',0,1,'c')
        format.
        """
        batch_size = 100
        c01b_test = TFD(which_set='test', axes=('c', 0, 1, 'b'))
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b = c01b_test.get_topological_view(c01b_X)
        assert c01b.shape == (1, 48, 48, batch_size)
        b01c = c01b.transpose(3, 1, 2, 0)
        b01c_X = self.test.X[0:batch_size, :]
        assert c01b_X.shape == b01c_X.shape
        assert np.all(c01b_X == b01c_X)
        b01c_direct = self.test.get_topological_view(b01c_X)
        assert b01c_direct.shape == b01c.shape
        assert np.all(b01c_direct == b01c)

    def _first_batch(self, dataset, axes, batch_size):
        """Return the first sequential 'features' batch in ``axes`` order."""
        space = Conv2DSpace(shape=(48, 48),
                            num_channels=1,
                            axes=axes)
        iterator = dataset.iterator(mode='sequential',
                                    batch_size=batch_size,
                                    data_specs=(space, 'features'))
        return iterator.next()

    def test_iterator(self):
        """
        Tests that batches returned by an iterator with topological
        data_specs are the same as the ones returned by calling
        get_topological_view on the dataset with the corresponding order.
        """
        batch_size = 100
        b01c_axes = ('b', 0, 1, 'c')
        c01b_axes = ('c', 0, 1, 'b')

        b01c_X = self.test.X[0:batch_size, :]
        b01c_topo = self.test.get_topological_view(b01c_X)
        b01c_b01c = self._first_batch(self.test, b01c_axes, batch_size)
        assert np.all(b01c_topo == b01c_b01c)

        c01b_test = TFD(which_set='test', axes=c01b_axes)
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b_topo = c01b_test.get_topological_view(c01b_X)
        c01b_c01b = self._first_batch(c01b_test, c01b_axes, batch_size)
        assert np.all(c01b_topo == c01b_c01b)

        # Also check that samples from iterators with the same data_specs
        # with Conv2DSpace do not depend on the axes of the dataset
        b01c_c01b = self._first_batch(self.test, c01b_axes, batch_size)
        assert np.all(b01c_c01b == c01b_c01b)

        c01b_b01c = self._first_batch(c01b_test, b01c_axes, batch_size)
        assert np.all(c01b_b01c == b01c_b01c)
Exemplo n.º 13
0
from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.tfd import TFD

# Load the TFD training split that the preprocessors will be fitted on.
train = TFD(which_set='train')

# Pipeline: global contrast normalization followed by ZCA, in that order.
preprocessor = preprocessing.Pipeline()
for step_class in (preprocessing.GlobalContrastNormalization,
                   preprocessing.ZCA):
    preprocessor.items.append(step_class())

# Apply to the training data with fitting enabled.
preprocessor.apply(train, can_fit=True)

# Save the preprocessor object itself so it can be loaded again later.
serial.save('tfd_gcn_whitener.pkl', preprocessor)
Exemplo n.º 14
0
 def setUp(self):
     """Load the shared fixtures and verify the other TFD variants load.

     ``skip_if_no_data()`` runs first (presumably skipping the test when
     the dataset is unavailable -- confirm against its definition).  The
     ``train`` and ``test`` splits are stored on the instance; every other
     variant is only constructed, never used afterwards.
     """
     skip_if_no_data()
     self.train = TFD(which_set='train')
     self.test = TFD(which_set='test')

     # These were previously bound to unused locals, which kept all of them
     # alive until the end of setUp.  Constructing them is the whole check.
     load_only = (
         dict(which_set='valid'),
         dict(which_set='unlabeled'),
         dict(which_set='full_train'),
         dict(which_set='test', image_size=96),
         dict(which_set='test', fold=1),
         dict(which_set='test', fold=2),
         dict(which_set='test', fold=3),
         dict(which_set='test', fold=4),
     )
     for kwargs in load_only:
         TFD(**kwargs)
Exemplo n.º 15
0
class TestTFD(unittest.TestCase):
    """Tests for loading TFD and for its topological views and iterators."""

    def setUp(self):
        """Load the shared fixtures and verify the other TFD variants load.

        ``skip_if_no_data()`` runs first (presumably skipping the test when
        the dataset is unavailable -- confirm against its definition).  The
        ``train`` and ``test`` splits are stored for the test methods; every
        other variant is only constructed, never used afterwards.
        """
        skip_if_no_data()
        self.train = TFD(which_set='train')
        self.test = TFD(which_set='test')

        # These were previously bound to unused locals, which kept all of
        # them alive until the end of setUp.  Constructing them is the
        # whole check.
        load_only = (
            dict(which_set='valid'),
            dict(which_set='unlabeled'),
            dict(which_set='full_train'),
            dict(which_set='test', image_size=96),
            dict(which_set='test', fold=1),
            dict(which_set='test', fold=2),
            dict(which_set='test', fold=3),
            dict(which_set='test', fold=4),
        )
        for kwargs in load_only:
            TFD(**kwargs)

    def test_topo(self):
        """Tests that a topological batch has 4 dimensions"""
        topo = self.train.get_batch_topo(1)
        assert topo.ndim == 4

    def test_topo_c01b(self):
        """
        Tests that a topological batch with axes ('c',0,1,'b')
        can be dimshuffled back to match the standard ('b',0,1,'c')
        format.
        """
        batch_size = 100
        c01b_test = TFD(which_set='test', axes=('c', 0, 1, 'b'))
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b = c01b_test.get_topological_view(c01b_X)
        assert c01b.shape == (1, 48, 48, batch_size)
        b01c = c01b.transpose(3, 1, 2, 0)
        b01c_X = self.test.X[0:batch_size, :]
        assert c01b_X.shape == b01c_X.shape
        assert np.all(c01b_X == b01c_X)
        b01c_direct = self.test.get_topological_view(b01c_X)
        assert b01c_direct.shape == b01c.shape
        assert np.all(b01c_direct == b01c)

    def test_iterator(self):
        """
        Tests that batches returned by an iterator with topological
        data_specs are the same as the ones returned by calling
        get_topological_view on the dataset with the corresponding order.
        """
        batch_size = 100
        b01c_X = self.test.X[0:batch_size, :]
        b01c_topo = self.test.get_topological_view(b01c_X)
        b01c_b01c_it = self.test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(48, 48),
                                    num_channels=1,
                                    axes=('b', 0, 1, 'c')), 'features'))
        b01c_b01c = b01c_b01c_it.next()
        assert np.all(b01c_topo == b01c_b01c)

        c01b_test = TFD(which_set='test', axes=('c', 0, 1, 'b'))
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b_topo = c01b_test.get_topological_view(c01b_X)
        c01b_c01b_it = c01b_test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(48, 48),
                                    num_channels=1,
                                    axes=('c', 0, 1, 'b')), 'features'))
        c01b_c01b = c01b_c01b_it.next()
        assert np.all(c01b_topo == c01b_c01b)

        # Also check that samples from iterators with the same data_specs
        # with Conv2DSpace do not depend on the axes of the dataset
        b01c_c01b_it = self.test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(48, 48),
                                    num_channels=1,
                                    axes=('c', 0, 1, 'b')), 'features'))
        b01c_c01b = b01c_c01b_it.next()
        assert np.all(b01c_c01b == c01b_c01b)

        c01b_b01c_it = c01b_test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(48, 48),
                                    num_channels=1,
                                    axes=('b', 0, 1, 'c')), 'features'))
        c01b_b01c = c01b_b01c_it.next()
        assert np.all(c01b_b01c == b01c_b01c)
Exemplo n.º 16
0
from dataset import Dataset
import theano
import theano.tensor as T
import numpy

from pylearn2.datasets.preprocessing import Standardize, LeCunLCN, GlobalContrastNormalization
from pylearn2.datasets.tfd import TFD

import pickle as pkl

# NOTE(review): assigns False to the attribute ``subtensor_merge_bug`` on the
# theano module; its effect is not visible in this file -- confirm that it
# is still required.
theano.subtensor_merge_bug = False

if __name__ == "__main__":
    weights_file = "../out/pae_mnist_enc_weights.npy"
    input = T.matrix("X", dtype=theano.config.floatX)
    tfd_ds = TFD("unlabeled")

    print(("TFD shape: ", tfd_ds.X.shape))
    gcn = GlobalContrastNormalization()
    standardizer = Standardize()
    lcn = LeCunLCN(img_shape=(48, 48), channels=[0])
    gcn.apply(tfd_ds, can_fit=True)
    standardizer.apply(tfd_ds, can_fit=True)
    lcn.apply(tfd_ds)

    rnd = numpy.random.RandomState(1231)

    powerup = PowerupAutoencoder(input,
                                 nvis=48 * 48,
                                 nhid=500,
                                 momentum=0.66,