def main():
    """Create the STL-10 8x8 patch dataset used for unsupervised learning.

    Loads the 32x32 downsampled STL-10 ``unlabeled`` and ``train`` pickles
    from ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32``, stacks their design
    matrices, and runs an ExtractPatches -> GlobalContrastNormalization ->
    ZCA pipeline over the result (2 million 8x8 patches).  The processed
    dataset, the pipeline that produced it and a README describing both are
    saved under ``${PYLEARN2_DATA_PATH}/stl10/stl10_patches_8x8``.
    """
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL10-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'
    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    # The train examples now live in `data`; drop the extra reference.
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    # The context manager closes the README even if write() raises.
    # The patch size stated in the text matches patch_shape below (8x8).
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        data.pkl contains a pylearn2 Dataset object defining an unlabeled
        dataset of 2 million 8x8 approximately whitened, contrast-normalized
        patches drawn uniformly at random from a downsampled (to 32x32)
        version of the STL-10 train and unlabeled datasets.

        preprocessor.pkl contains a pylearn2 Pipeline object that was used
        to extract the patches and approximately whiten / contrast normalize
        them. This object is necessary when extracting features for
        supervised learning or test set classification, because the
        extracted features must be computed using inputs that have been
        whitened with the ZCA matrix learned and stored by this Pipeline.

        They were created with the pylearn2 script make_stl10_patches.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    # can_fit=True lets data-driven steps fit themselves to this dataset.
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Point the design matrix at a .npy file next to the pickle.
    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def main():
    """Create the CIFAR-100 6x6 patch dataset used for unsupervised learning.

    Loads the CIFAR-100 train set and runs an ExtractPatches ->
    GlobalContrastNormalization -> ZCA pipeline over it (2 million 6x6
    patches).  The processed dataset, the pipeline that produced it and a
    README describing both are saved under
    ``${PYLEARN2_DATA_PATH}/cifar100/cifar100_patches``.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    # The context manager closes the README even if write() raises.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        data.pkl contains a pylearn2 Dataset object defining an unlabeled
        dataset of 2 million 6x6 approximately whitened, contrast-normalized
        patches drawn uniformly at random from the CIFAR-100 train set.

        preprocessor.pkl contains a pylearn2 Pipeline object that was used
        to extract the patches and approximately whiten / contrast normalize
        them. This object is necessary when extracting features for
        supervised learning or test set classification, because the
        extracted features must be computed using inputs that have been
        whitened with the ZCA matrix learned and stored by this Pipeline.

        They were created with the pylearn2 script make_cifar100_patches.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    # can_fit=True lets data-driven steps fit themselves to this dataset.
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Point the design matrix at a .npy file next to the pickle.
    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def get_preprocess_gcn(self, preprocess_id):
    """Build a GlobalContrastNormalization from its ``hps3`` table row.

    Parameters
    ----------
    preprocess_id :
        Value matched against ``hps3.preprocess_gcn.preprocess_id``; it is
        passed to the database layer as a bound query parameter.

    Returns
    -------
    The ``pp.GlobalContrastNormalization`` configured with the stored
    ``subtract_mean``, ``std_bias`` and ``use_norm`` values.

    Raises
    ------
    HPSData
        If the query yields no (or an empty) row for ``preprocess_id``.
    """
    query = """ SELECT subtract_mean, std_bias, use_norm FROM hps3.preprocess_gcn WHERE preprocess_id = %s """
    record = self.db.executeSQL(query, (preprocess_id, ), self.db.FETCH_ONE)

    # A missing row comes back falsy (None or empty), which covers both cases.
    if not record:
        raise HPSData("No gcn preprocess for preprocess_id=" + str(preprocess_id))

    subtract_mean, std_bias, use_norm = record
    return pp.GlobalContrastNormalization(
        subtract_mean=subtract_mean,
        std_bias=std_bias,
        use_norm=use_norm,
    )
def get_pipeline(img_shape, patch_size, batch_size):
    """Assemble the preprocessing pipeline selected by the configuration.

    The ``'preprocessing'`` section of the mapping returned by
    ``get_config()`` is consulted for three flags, applied in this order:
    ``'remove_mean'``, ``'gcn'`` and ``'lcn'``.

    Parameters
    ----------
    img_shape :
        Forwarded as the first argument of ``preprocessing.LeCunLCN``.
    patch_size :
        Requested LCN kernel size; an even value is increased by one.
    batch_size :
        Forwarded to ``preprocessing.GlobalContrastNormalization``.

    Returns
    -------
    The ``preprocessing.Pipeline`` holding the enabled steps (it has no
    items when every flag is falsy).
    """
    pipeline = preprocessing.Pipeline()
    flags = get_config()['preprocessing']
    steps = pipeline.items

    if flags['remove_mean']:
        steps.append(preprocessing.RemoveMean())

    if flags['gcn']:
        gcn = preprocessing.GlobalContrastNormalization(batch_size=batch_size)
        steps.append(gcn)

    if flags['lcn']:
        # LCN requires an odd kernel size, so even sizes are bumped by one.
        lcn_patch_size = patch_size + 1 - (patch_size % 2)
        lcn = preprocessing.LeCunLCN(img_shape, kernel_size=lcn_patch_size)
        steps.append(lcn)

    return pipeline
def main():
    """Preprocess CIFAR-10 with GCN followed by a 512-component PCA.

    The pipeline is fitted on the (centered) train set and then applied
    unchanged to the test set.  Both datasets are pickled into the current
    working directory.
    """
    train = cifar10.CIFAR10(which_set="train", center=True)

    pipeline = preprocessing.Pipeline()
    normalizer = preprocessing.GlobalContrastNormalization(
        subtract_mean=False, sqrt_bias=0.0, use_std=True)
    pipeline.items.append(normalizer)
    reducer = preprocessing.PCA(num_components=512)
    pipeline.items.append(reducer)

    test = cifar10.CIFAR10(which_set="test")

    # Only the train set may fit data-driven steps; test reuses that fit.
    for dataset, may_fit in ((train, True), (test, False)):
        dataset.apply_preprocessor(preprocessor=pipeline, can_fit=may_fit)

    outputs = (
        ('cifar10_preprocessed_train.pkl', train),
        ('cifar10_preprocessed_test.pkl', test),
    )
    for filename, dataset in outputs:
        serial.save(filename, dataset)
def get_processed_dataset():
    """Return the ``(trainset, testset)`` CIFAR-10 pair, building it if needed.

    When both ``pp_cifar10_train.pkl`` and ``pp_cifar10_test.pkl`` exist in
    the working directory and the module-level ``new_params`` flag is falsy,
    the pickles are loaded.  Otherwise the train set is run through an
    ExtractPatchesWithPosition -> GlobalContrastNormalization -> PCA ->
    ExtractPatchPairs pipeline (configured from module-level settings such as
    ``patch_shape`` and ``num_components``) and both datasets are pickled.

    In either case ``yaml_src`` of each returned dataset is pointed at its
    pickle so the weights can be visualized after training.
    """
    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    cached = os.path.exists(train_path) and os.path.exists(test_path)
    if cached and not new_params:
        # print() with one argument behaves identically on Python 2 and 3.
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')

        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape,
                patches_per_image=patches_per_image))
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))
        pipeline.items.append(
            preprocessing.PCA(num_components=num_components,
                              keep_var_fraction=keep_var_fraction))
        pipeline.items.append(
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # NOTE(review): the test set is saved without the pipeline being
        # applied to it -- confirm this is intentional.

        # the pkl-ing is having issues, the dataset is maybe too big.
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

    # this path will be used for visualizing weights after training is done
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.

    Returns the ``(trainset, testset)`` pair.  If both
    ``cifar10_preprocessed_train.pkl`` and ``cifar10_preprocessed_test.pkl``
    exist in the working directory they are loaded; otherwise the raw
    datasets are run through ExtractPatches -> GlobalContrastNormalization
    -> ZCA, stored, and returned.  ``yaml_src`` of each returned dataset is
    pointed at its pickle.
    """
    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path):
        # print() with one argument behaves identically on Python 2 and 3.
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8),
                                         num_patches=150000))
        pipeline.items.append(preprocessing.GlobalContrastNormalization())
        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # The test set must reuse the whitening fitted on the train set;
        # refitting here would whiten the two sets with different matrices.
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

    # this path will be used for visualizing weights after training is done
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def generate_patches():
    """Preprocess the GenderWrite splits with GCN + ZCA and pickle them.

    Four splits are built ('train', 'valid', 'test', 'tottrain'), processed
    in that order with one shared pipeline, and written under ``DATA_DIR``
    as ``gw_preprocessed_<split>.pkl`` with the design matrix stored in
    ``<split>_design.npy``.  The pipeline is allowed to fit on 'train' and
    'tottrain' only; 'valid' and 'test' reuse whatever was fitted last.
    """
    datasets = OrderedDict()
    datasets['train'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=1, stop=201)
    datasets['valid'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=201, stop=283)
    datasets['test'] = GenderWrite.gwdata.GWData(which_set='test')
    datasets['tottrain'] = GenderWrite.gwdata.GWData(which_set='train')

    # preprocess patches
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalization())
    pipeline.items.append(preprocessing.ZCA())

    # items() (rather than the Python-2-only iteritems()) and print() keep
    # this loop working on both Python 2 and Python 3.
    for dstr, dset in datasets.items():
        print(dstr)

        # only fit on train data
        trainbool = dstr in ('train', 'tottrain')
        dset.apply_preprocessor(preprocessor=pipeline, can_fit=trainbool)

        # save
        dset.use_design_loc(DATA_DIR + dstr + '_design.npy')
        serial.save(DATA_DIR + 'gw_preprocessed_' + dstr + '.pkl', dset)
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Script: extract 2,000,000 8x8 CIFAR-10 patches, contrast-normalize and
# ZCA-whiten them, then store the train and test results.
train = cifar10.CIFAR10(which_set="train")

pipeline = preprocessing.Pipeline()
pipeline.items.extend([
    preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2000000),
    preprocessing.GlobalContrastNormalization(),
    preprocessing.ZCA(),
])

test = cifar10.CIFAR10(which_set="test")

# Data-driven steps are fitted on train only; test reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Design matrices go to separate .npy files alongside the pickles.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M_design.npy'
)
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M_design.npy'
)

serial.save(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M.pkl', train
)
serial.save(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M.pkl', test
)
# Replicate the preprocessing described in Kai Yu's paper
# "Improving LCC with Local Tangents".
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train", center=True)

# Contrast normalization without mean subtraction, then PCA to 512 components.
pipeline = preprocessing.Pipeline()
pipeline.items.extend([
    preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                              std_bias=0.0),
    preprocessing.PCA(num_components=512),
])

test = cifar10.CIFAR10(which_set="test")

# Data-driven steps are fitted on train only; test reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
def generate(opc):
    """
    Summary (Generates a dataset with the chosen transformation).

    Random 19x19 images are drawn, and for each one a pair of 13x13 crops is
    stored: the central crop, and the central crop of the transformed image.
    The target is the index of the transformation that was applied.  Both
    crops are contrast-normalized together and the result is pickled as
    ``train_preprocessed.pkl`` next to this file.

    Parameters
    ----------
    opc: string
        Only two options, shifts or rotations.
    """
    dim = 19  # outer square
    # A bigger image is used to avoid empty pixels in the borders.
    reg = 13  # inner square
    total = 20000  # Number of training examples

    im1 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    im2 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    Y = numpy.zeros((total, 1), dtype='uint8')
    rng = make_np_rng(9001, [1, 2, 3], which_method="uniform")
    transformation = opc

    if transformation == 'shifts':
        # Shifts
        # only shifts between [-3, +3] pixels
        shifts = list(itertools.product(range(-3, 4), range(-3, 4)))
        for t in range(total):
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(shifts))
            Y[t] = ind
            tx, ty = shifts[ind]
            # Same 13x13 window, displaced by (tx, ty) inside the 19x19 image.
            im_y = x[(3 + tx):(16 + tx), (3 + ty):(16 + ty)][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y
    else:
        assert transformation == 'rotations'
        # Rotations
        # Pillow only provides the package-qualified import; the bare
        # module name is kept as a fallback for legacy PIL installations.
        try:
            from PIL import Image
        except ImportError:
            import Image
        angs = numpy.linspace(0, 359, 90)
        for t in range(total):
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(angs))
            Y[t] = ind
            ang = angs[ind]
            y = numpy.asarray(Image.fromarray(x).rotate(ang))
            im_y = y[3:16, 3:16][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y

    view_converter = dense_design_matrix.DefaultViewConverter((reg, reg, 1))

    design_X = view_converter.topo_view_to_design_mat(im1)
    design_Y = view_converter.topo_view_to_design_mat(im2)

    # Normalize data: both views are stacked so they go through the same
    # preprocessing call, then split apart again below.
    pipeline = preprocessing.Pipeline()
    gcn = preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                    use_std=True)
    pipeline.items.append(gcn)
    XY = numpy.concatenate((design_X, design_Y), 0)
    XY_ImP = dense_design_matrix.DenseDesignMatrix(X=XY)
    XY_ImP.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    n_examples = design_X.shape[0]
    X1 = XY_ImP.X[0:n_examples, :]
    X2 = XY_ImP.X[n_examples:, :]

    # As a Conv2DSpace
    # (a flat alternative would use VectorSpace(reg * reg) for both views
    # together with X1 / X2 directly)
    topo_X1 = view_converter.design_mat_to_topo_view(X1)
    topo_X2 = view_converter.design_mat_to_topo_view(X2)
    axes = ('b', 0, 1, 'c')
    data_specs = (CompositeSpace([
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        VectorSpace(1)
    ]), ('featuresX', 'featuresY', 'targets'))
    train = VectorSpacesDataset((topo_X1, topo_X2, Y), data_specs=data_specs)

    import os

    save_path = os.path.dirname(os.path.realpath(__file__))
    serial.save(os.path.join(save_path, 'train_preprocessed.pkl'), train)
if str(data.X.dtype) != config.floatX: logging.warning("The dataset is saved as {}, changing theano's floatX " \ "to the same dtype".format(data.X.dtype)) config.floatX = str(data.X.dtype) # Load train data train = SVHN('splitted_train', path=local_path) check_dtype(train) # prepare preprocessing pipeline = preprocessing.Pipeline() # without batch_size there is a high chance that you might encounter memory error # or pytables crashes pipeline.items.append( preprocessing.GlobalContrastNormalization(batch_size=5000)) pipeline.items.append(preprocessing.LeCunLCN((32, 32))) # apply the preprocessings to train train.apply_preprocessor(pipeline, can_fit=True) del train # load and preprocess valid valid = SVHN('valid', path=local_path) check_dtype(valid) valid.apply_preprocessor(pipeline, can_fit=False) # load and preprocess test test = SVHN('test', path=local_path) check_dtype(test) test.apply_preprocessor(pipeline, can_fit=False)
def get_dataset(tot=False, preprocessor='normal'):
    """Return preprocessed digit datasets, building and caching them if needed.

    Parameters
    ----------
    tot : bool
        If true, the first returned dataset is the full training set
        ('tottrain') instead of the 0..34000 training split.
    preprocessor : str
        Selects the variant and is part of every cache file name.
        'normal' applies GCN + ZCA, 'nozca' applies GCN only.  Any other
        value applies GCN + ZCA and then passes every example through a
        28x28 8-bit image round trip, with an extra transformation for
        'rotate', 'emboss', 'hshear', 'vshear', 'patch' and 'noisy'.

    Returns
    -------
    ``(tottrainset, validset, testset)`` when ``tot`` is true, otherwise
    ``(trainset, validset, testset)``.
    """
    if not os.path.exists(DATA_DIR+'train.npy') or \
            not os.path.exists(DATA_DIR+'test.npy') or \
            not os.path.exists(DATA_DIR+'targets.npy'):
        initial_read()

    train_path = DATA_DIR+'train_'+preprocessor+'_preprocessed.pkl'
    valid_path = DATA_DIR+'valid_'+preprocessor+'_preprocessed.pkl'
    tottrain_path = DATA_DIR+'tottrain_'+preprocessor+'_preprocessed.pkl'
    test_path = DATA_DIR+'test_'+preprocessor+'_preprocessed.pkl'

    # The cache is only usable when every pickle that will be loaded exists;
    # the tottrain pickle is required only when the caller asked for it.
    needed = [train_path, valid_path, test_path]
    if tot:
        needed.append(tottrain_path)

    if all(os.path.exists(path) for path in needed):
        # print() with one argument behaves identically on Python 2 and 3.
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = Digits(which_set='train', start=0, stop=34000)
        validset = Digits(which_set='train', start=34000, stop=42000)
        tottrainset = Digits(which_set='train')
        testset = Digits(which_set='test')

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))

        if preprocessor != 'nozca':
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original
            # image better
            pipeline.items.append(preprocessing.ZCA())

        # note the can_fit=False's: no sharing between train and valid data
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)

        if preprocessor not in ('normal', 'nozca'):
            for data in (trainset, validset, tottrainset, testset):
                for ii in range(data.X.shape[0]):
                    # normalize to [0,1]
                    # NOTE(review): a constant row gives dmax == dmin and a
                    # division by zero here -- confirm this cannot occur.
                    dmax = np.max(data.X[ii, :])
                    dmin = np.min(data.X[ii, :])
                    dnorm = (data.X[ii, :] - dmin) / (dmax - dmin)
                    # and convert to PIL image
                    img = Image.fromarray(
                        dnorm.reshape(28, 28) * 255.).convert('L')

                    # apply preprocessor
                    if preprocessor == 'rotate':
                        rot = rng.randint(-40, 41)
                        img = img.rotate(rot, Image.BILINEAR)
                    elif preprocessor == 'emboss':
                        img = emboss(img)
                    elif preprocessor == 'hshear':
                        # coef = 0 means unsheared
                        coef = -1 + np.random.rand()*2
                        # note: image is moved with (coef/2)*28 to center it
                        # after shearing
                        img = img.transform(
                            (28, 28), Image.AFFINE,
                            (1, coef, -(coef/2)*28, 0, 1, 0),
                            Image.BILINEAR)
                    elif preprocessor == 'vshear':
                        coef = -1 + np.random.rand()*2
                        img = img.transform(
                            (28, 28), Image.AFFINE,
                            (1, 0, 0, coef, 1, -(coef/2)*28),
                            Image.BILINEAR)
                    elif preprocessor == 'patch':
                        # negative values are not possible in PIL, so do a
                        # zoom only transform then
                        x1 = np.random.randint(0, 5)
                        y1 = np.random.randint(0, 5)
                        x2 = np.random.randint(0, 5)
                        y2 = np.random.randint(0, 5)
                        img = img.transform(
                            (28, 28), Image.EXTENT,
                            (x1, y1, 28-x2, 28-y2),
                            Image.BILINEAR)

                    # convert back to numpy array
                    data.X[ii, :] = np.array(img.getdata()) / 255.

                    if preprocessor == 'noisy':
                        # add noise
                        data.X[ii, :] += np.random.randn(28*28) * 0.1
                        # bound between [0,1]
                        data.X[ii, :] = np.minimum(
                            np.ones(28*28),
                            np.maximum(np.zeros(28*28), data.X[ii, :]))

        # this uses numpy format for storage instead of pickle, for memory
        # reasons
        trainset.use_design_loc(DATA_DIR+'train_'+preprocessor+'_design.npy')
        validset.use_design_loc(DATA_DIR+'valid_'+preprocessor+'_design.npy')
        tottrainset.use_design_loc(
            DATA_DIR+'tottrain_'+preprocessor+'_design.npy')
        testset.use_design_loc(DATA_DIR+'test_'+preprocessor+'_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
def test_works():
    """Train the facial-keypoint convolutional MLP on the cached datasets.

    With ``load`` left at True the (train, valid) pair is read from
    ``kpd.pkl`` and a two-conv-layer + rectified-linear + multi-softmax MLP
    is trained, saving to ``kpd_model2.pkl`` / ``kpd_best.pkl``.  Setting
    ``load`` to False instead builds the two datasets, standardizes and
    contrast-normalizes them, writes ``kpd.pkl`` and returns without
    training.
    """
    # Flip to False once to (re)generate kpd.pkl.
    load = True
    if not load:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)

        # valid can_fit = false
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        # doubt, how about can_fit = False?
        stndrdz.apply(ddmValid, can_fit=False)

        gcn = preprocessing.GlobalContrastNormalization()
        gcn.apply(ddmTrain, can_fit=True)
        gcn.apply(ddmValid, can_fit=False)

        # The context manager closes the file even if dump() raises.
        with open('kpd.pkl', 'wb') as pckl_file:
            pickle.dump((ddmTrain, ddmValid), pckl_file)
        return
    else:
        # kpd.pkl is expected to be the file written by the branch above;
        # pickle must never be pointed at untrusted data.
        with open('kpd.pkl', 'rb') as pckl_file:
            (ddmTrain, ddmValid) = pickle.load(pckl_file)

    # creating layers
    # 2 convolutional rectified layers, border mode valid
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Rectified linear units
    layer3 = RectifiedLinear(dim=3000,
                             sparse_init=15,
                             layer_name='RectLin3')

    # multisoftmax
    n_groups = 30
    n_classes = 98
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups,
                           irange=0.05,
                           n_classes=n_classes,
                           layer_name=layer_name)

    # setting up MLP
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    # mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    # algorithm
    # learning rate, momentum, batch size, monitoring dataset, cost,
    # termination criteria
    term_crit = MonitorBased(prop_decrease=0.00001,
                             N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={
                            'validation': ddmValid,
                            'training': ddmTrain
                        },
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    # train extension
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998,
                                          min_lr_scale=0.01)

    # train object
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[
                      train_ext,
                      MonitorBasedSaveBest(
                          channel_name='validation_objective',
                          save_path='kpd_best.pkl'),
                      MomentumAdjustor(start=1,
                                       saturate=20,
                                       final_momentum=.9)
                  ])
    train.main_loop()
    train.save()
# We'd like to do several operations on them, so we'll set up a pipeline to # do so. pipeline = preprocessing.Pipeline() # First we want to pull out small patches of the images, since it's easier # to train an RBM on these pipeline.items.append( preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000) ) # Next we contrast normalize the patches. The default arguments use the # same "regularization" parameters as those used in Adam Coates, Honglak # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in # Unsupervised Feature Learning" pipeline.items.append(preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True)) # Finally we whiten the data using ZCA. Again, the default parameters to # ZCA are set to the same values as those used in the previously mentioned # paper. pipeline.items.append(preprocessing.ZCA()) # Here we apply the preprocessing pipeline to the dataset. The can_fit # argument indicates that data-driven preprocessing steps (such as the ZCA # step in this example) are allowed to fit themselves to this dataset. # Later we might want to run the same pipeline on the test set with the # can_fit flag set to False, in order to make sure that the same whitening # matrix was used on both datasets. train.apply_preprocessor(preprocessor=pipeline, can_fit=True) # Finally we save the dataset to the filesystem. We instruct the dataset to
# Replicate the preprocessing described in Kai Yu's paper
# "Improving LCC with Local Tangents".
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train", center=True)

# Contrast normalization without mean subtraction, then PCA to 512 components.
pipeline = preprocessing.Pipeline()
pipeline.items.extend([
    preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                              sqrt_bias=0.0,
                                              use_std=True),
    preprocessing.PCA(num_components=512),
])

test = cifar10.CIFAR10(which_set="test")

# Data-driven steps are fitted on train only; test reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Script: extract 2,000,000 8x8 CIFAR-10 patches and contrast-normalize them
# (std_bias=0.0, use_norm=1.), then store the train and test results.
train = cifar10.CIFAR10(which_set="train")

pipeline = preprocessing.Pipeline()
pipeline.items.extend([
    preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2000000),
    preprocessing.GlobalContrastNormalization(std_bias=0.0, use_norm=1.),
])

test = cifar10.CIFAR10(which_set="test")

train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Design matrices go to separate .npy files alongside the pickles.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_sphere_train_2M_design.npy'
)
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_sphere_test_2M_design.npy'
)

serial.save('/data/lisatmp/goodfeli/cifar10_sphere_train_2M.pkl', train)
serial.save('/data/lisatmp/goodfeli/cifar10_sphere_test_2M.pkl', test)

# Re-read the train set from the pickle that was just written.
train = serial.load('/data/lisatmp/goodfeli/cifar10_sphere_train_2M.pkl')
to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the extracted features must be computed using inputs that have been whitened with the ZCA matrix learned and stored by this Pipeline. They were created with the pylearn2 script make_cifar100_patches.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """) README.close() print("Preprocessing the data...") pipeline = preprocessing.Pipeline() pipeline.items.append( preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2 * 1000 * 1000)) pipeline.items.append( preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True)) pipeline.items.append(preprocessing.ZCA()) data.apply_preprocessor(preprocessor=pipeline, can_fit=True) data.use_design_loc(patch_dir + '/data.npy') serial.save(patch_dir + '/data.pkl', data) serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def get_data(tot=True, flatgrey=False):
    """Load the cached Galaxy Zoo datasets, or build the cache on first use.

    Parameters
    ----------
    tot : bool
        Must be true to load cached data: only the full training set
        ('tottrain') and the test set are generated by this function.
    flatgrey :
        Forwarded to ``GalaxyZoo.gzdeepdata.GZData`` when the cache is built.

    Returns
    -------
    ``(tottrain, test)`` when the test pickle already exists.  Otherwise the
    datasets are preprocessed (GCN with ``use_std=True``, then ZCA), stored
    under ``DATA_DIR``, and ``(None, None)`` is returned so the caller can
    re-run with the cache in place.

    Raises
    ------
    KeyError
        If the cache exists and ``tot`` is false, because the separate
        'train' split is not generated.
    """
    tottrain_path = (DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL)
                     + '_64x.pkl')
    test_path = DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl'

    if os.path.exists(test_path):
        # print() with one argument behaves identically on Python 2 and 3.
        print('loading preprocessed data')
        if not tot:
            # Generation of the separate train (0..39999) and valid
            # (40000..61577) splits is disabled, so there is nothing to load.
            raise KeyError("'train' split is not generated by get_data(); "
                           "call it with tot=True")
        return serial.load(tottrain_path), serial.load(test_path)

    print('preprocessing data...')
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(use_std=True))
    pipeline.items.append(preprocessing.ZCA())

    print('tottraindata')
    data = GalaxyZoo.gzdeepdata.GZData(which_set='training',
                                       flatgrey=flatgrey)
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    # this path can be used for visualizing weights after training is done
    # (it must name the pickle file, not the dataset object itself)
    data.yaml_src = '!pkl: "%s"' % tottrain_path
    # save
    data.use_design_loc(DATA_DIR + 'tottrain_design' + str(SUBMODEL)
                        + '_64x.npy')
    serial.save(tottrain_path, data)

    print('testdata')
    data = GalaxyZoo.gzdeepdata.GZData(which_set='test', flatgrey=flatgrey)
    # The test set reuses the pipeline fitted on the training data.
    data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
    # this path can be used for visualizing weights after training is done
    data.yaml_src = '!pkl: "%s"' % test_path
    # save
    data.use_design_loc(DATA_DIR + 'test_design' + str(SUBMODEL)
                        + '_64x.npy')
    serial.save(test_path, data)

    print('Finished, now re-run for running model on GPU')
    return None, None