def main():
    """Create the STL-10 8x8 patch dataset and its fitted preprocessor.

    Steps, in order:

    1. Load the downsampled (32x32) STL-10 ``unlabeled`` and ``train``
       pickles from ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32`` and stack
       their design matrices into a single dataset.
    2. Create ``${PYLEARN2_DATA_PATH}/stl10/stl10_patches_8x8`` and write a
       README describing the generated files.
    3. Run a patch-extraction / contrast-normalization / ZCA pipeline over
       the data (with ``can_fit=True``) and save both the processed dataset
       (``data.pkl`` + ``data.npy``) and the pipeline
       (``preprocessor.pkl``).

    Takes no arguments and returns nothing; all results are written to disk.
    """
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL10-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    # Drop the only reference to the train set so its memory can be
    # reclaimed before the (large) patch extraction below.
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)

    # The README previously advertised 6x6 patches although this script
    # extracts 8x8 ones; the text now matches the pipeline below.
    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)
    # Context manager guarantees the handle is closed even if write() fails.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def main():
    """Create the CIFAR-100 6x6 patch dataset and its fitted preprocessor.

    Loads the CIFAR-100 train set, creates
    ``${PYLEARN2_DATA_PATH}/cifar100/cifar100_patches``, writes a README
    describing the generated files, then runs a patch-extraction /
    contrast-normalization / ZCA pipeline over the data (with
    ``can_fit=True``) and saves the processed dataset (``data.pkl`` +
    ``data.npy``) and the pipeline (``preprocessor.pkl``).

    Takes no arguments and returns nothing; all results are written to disk.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)

    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)
    # Context manager guarantees the handle is closed even if write() fails.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.

    If both ``cifar10_preprocessed_train.pkl`` and
    ``cifar10_preprocessed_test.pkl`` exist in the current working
    directory they are loaded as-is. Otherwise the raw CIFAR-10 train and
    test sets are loaded, run through a patch-extraction /
    contrast-normalization / ZCA pipeline, and saved under those names
    (design matrices go to ``train_design.npy`` / ``test_design.npy``).

    Returns
    -------
    (trainset, testset)
        The two preprocessed datasets, each with ``yaml_src`` set to a
        ``!pkl:`` reference to its pickle file.
    """
    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path):
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8),
                                         num_patches=150000))
        pipeline.items.append(preprocessing.GlobalContrastNormalization())
        pipeline.items.append(preprocessing.ZCA())

        # Fit the pipeline on the training data only.
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # The test set must be transformed with the statistics learned from
        # the training set, so the pipeline is not allowed to re-fit here.
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

    # this path will be used for visualizing weights after training is done
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the extracted features must be computed using inputs that have been whitened with the ZCA matrix learned and stored by this Pipeline. They were created with the pylearn2 script make_cifar100_patches.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """) README.close() print("Preprocessing the data...") pipeline = preprocessing.Pipeline() pipeline.items.append( preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2 * 1000 * 1000)) pipeline.items.append( preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True)) pipeline.items.append(preprocessing.ZCA()) data.apply_preprocessor(preprocessor=pipeline, can_fit=True) data.use_design_loc(patch_dir + '/data.npy') serial.save(patch_dir + '/data.pkl', data) serial.save(patch_dir + '/preprocessor.pkl', pipeline)
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Script: preprocess CIFAR-10 into 8x8 patches and pickle the train and
# test results.

# Raw training images; the pipeline below is fitted on these.
train = cifar10.CIFAR10(which_set="train")

# Stages run in list order: patch extraction, then global contrast
# normalization, then ZCA.
pipeline = preprocessing.Pipeline()
pipeline.items.extend([
    preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000),
    preprocessing.GlobalContrastNormalization(),
    preprocessing.ZCA(),
])

test = cifar10.CIFAR10(which_set="test")

# Only the training set may fit the pipeline; the test set is passed
# through with can_fit=False.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Register the .npy locations for both design matrices before pickling.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_design.npy')
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_design.npy')

serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_train.pkl', train)
serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_test.pkl', test)
import os

from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Script: preprocess CIFAR-10 into 2 million 6x6 patches and pickle the
# train and test results under the directory named by $GOODFELI_TMP.

# Output directory is taken from the environment; an unset variable raises
# KeyError here, before any expensive work starts.
goodfeli_tmp = os.environ['GOODFELI_TMP']

train = cifar10.CIFAR10(which_set="train")

# Stages run in the order appended: patch extraction, then global contrast
# normalization, then ZCA.
pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.ExtractPatches(patch_shape=(6, 6), num_patches=2000000))
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
pipeline.items.append(preprocessing.ZCA())

test = cifar10.CIFAR10(which_set="test")

# Only the training set may fit the pipeline; the test set is passed
# through with can_fit=False.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)

# Single-argument print(...) calls are valid on both Python 2 and Python 3.
print('processing test set')
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

print('saving')
train.use_design_loc(
    goodfeli_tmp + '/cifar10_preprocessed_train_2M_6x6_design.npy')
test.use_design_loc(
    goodfeli_tmp + '/cifar10_preprocessed_test_2M_6x6_design.npy')

serial.save(goodfeli_tmp + '/cifar10_preprocessed_train_2M_6x6.pkl', train)
print('done saving train')
serial.save(goodfeli_tmp + '/cifar10_preprocessed_test_2M_6x6.pkl', test)
patches drawn uniformly at random from the CIFAR-100 train set. preprocessor.pkl contains a pylearn2 Pipeline object that was used to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the extracted features must be computed using inputs that have been whitened with the ZCA matrix learned and stored by this Pipeline. They were created with the pylearn2 script make_cifar100_patches.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """) README.close() print "Preprocessing the data..." pipeline = preprocessing.Pipeline() pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(6,6),num_patches=2*1000*1000)) pipeline.items.append(preprocessing.GlobalContrastNormalization()) pipeline.items.append(preprocessing.ZCA()) data.apply_preprocessor(preprocessor = pipeline, can_fit = True) data.use_design_loc(patch_dir + '/data.npy') serial.save(patch_dir + '/data.pkl',data) serial.save(patch_dir + '/preprocessor.pkl',pipeline)