""" This script makes a dataset of two million approximately whitened patches, extracted at random uniformly from the CIFAR-100 train dataset. This script is intended to reproduce the preprocessing used by Adam Coates et. al. in their work from the first half of 2011 on the CIFAR-10 and STL-10 datasets. """ from __future__ import print_function from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.datasets.cifar100 import CIFAR100 from pylearn2.utils import string data_dir = string.preprocess('${PYLEARN2_DATA_PATH}') print('Loading CIFAR-100 train dataset...') data = CIFAR100(which_set='train') print("Preparing output directory...") patch_dir = data_dir + '/cifar100/cifar100_patches_8x8' serial.mkdir(patch_dir) README = open(patch_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. data.pkl contains a pylearn2 Dataset object defining an unlabeled dataset of 2 million 8x8 approximately whitened, contrast-normalized
"""
This script makes a dataset of two million approximately whitened patches,
extracted at random uniformly from a downsampled version of the STL-10
unlabeled and train dataset.

It assumes that you have already run make_downsampled_stl10.py, which
downsamples the STL-10 images to 1/3 of their original resolution.

This script is intended to reproduce the preprocessing used by Adam Coates
et al. in their work from the first half of 2011. It does not
contrast-normalize the patches prior to whitening them.
"""
from __future__ import print_function

from pylearn2.utils import serial, string
from pylearn2.datasets import preprocessing
import numpy as np

# Directory holding the STL-10 data; presumably string.preprocess expands the
# ${PYLEARN2_DATA_PATH} reference -- confirm against pylearn2.utils.string.
data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

print('Loading STL-10 unlabeled and train datasets...')
downsampled_dir = data_dir + '/stl10_32x32'
data = serial.load(downsampled_dir + '/unlabeled.pkl')
supplement = serial.load(downsampled_dir + '/train.pkl')

print('Concatenating datasets...')
# Append the train examples to the unlabeled ones so that patches are drawn
# from both sets.
data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
# The train set is not referenced again once its rows have been appended.
del supplement

print("Preparing output directory...")
# The output root comes from ${GOODFELI_TMP}, not from the data path used for
# loading above, so data_dir is deliberately rebound here.
data_dir = string.preprocess('${GOODFELI_TMP}')
patch_dir = data_dir + '/stl10_patches_no_shelling'
serial.mkdir(patch_dir)
""" Makes a version of the STL-10 dataset that has been downsampled by a factor of 3 along both axes. """ from pylearn2.datasets.cifar10 import CIFAR10 from pylearn2.utils import string_utils as string print 'Preparing output directory...' data_dir = string.preprocess('/u/kruegerd/repo/current/pylearn2/pylearn2/datasets/cifar10') downsampled_dir = data_dir serial.mkdir( downsampled_dir ) #Unlabeled dataset is huge, so do it in chunks #(After downsampling it should be small enough to work with) final_unlabeled = np.zeros((100*1000,32*32*3),dtype='float32') for i in xrange(10): print 'Loading unlabeled chunk '+str(i+1)+'/10...' unlabeled = STL10(which_set = 'unlabeled', center = True, example_range = (i * 10000, (i+1) * 10000)) print 'Preprocessing unlabeled chunk...' print 'before ',(unlabeled.X.min(),unlabeled.X.max()) unlabeled.apply_preprocessor(preprocessor)
""" This script makes a dataset of 32x32 approximately whitened CIFAR-10 images. """ from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string from pylearn2.datasets.cifar100 import CIFAR100 data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print 'Loading CIFAR-100 train dataset...' train = CIFAR100(which_set='train') print "Preparing output directory..." output_dir = data_dir + '/whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of an approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples. preprocessor.pkl contains a pylearn2 ZCA object that was used