import os

from pylearn2.datasets import cifar10, preprocessing
from pylearn2.utils import serial


def get_processed_dataset():

    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path) \
            and not new_params:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.ExtractPatchesWithPosition(
            patch_shape=patch_shape, patches_per_image=patches_per_image))
        pipeline.items.append(preprocessing.GlobalContrastNormalization(
            sqrt_bias=10., use_std=True))
        pipeline.items.append(preprocessing.PCA(
            num_components=num_components,
            keep_var_fraction=keep_var_fraction))
        pipeline.items.append(preprocessing.ExtractPatchPairs(
            patches_per_image=patches_per_image, num_images=train_size,
            input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

        # the pkl-ing is having issues; the dataset may be too big
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # these paths will be used for visualizing weights after training
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
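
# A minimal usage sketch for get_processed_dataset(). The function reads its
# hyperparameters from module-level globals, so a caller has to define them
# first; the helper below is hypothetical and every value in it is an
# illustrative assumption (the patch settings simply mirror the ones used
# further down in this file).
def _demo_get_processed_dataset():
    global new_params, patch_shape, patches_per_image
    global num_components, keep_var_fraction
    global start, stop, train_size, input_width
    new_params = True        # force re-preprocessing even if the pkls exist
    patch_shape = (5, 5)
    patches_per_image = 4
    num_components = 20
    keep_var_fraction = 1e4
    start, stop = 0, 40000   # assumed slice of the 50000 training images
    train_size = stop - start
    input_width = 32         # CIFAR-10 images are 32x32
    return get_processed_dataset()
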
patch_width = 5
num_channels = 3
# num_components = patch_width**2 * num_channels
num_components = 20
keep_var_fraction = 1e4
patches_per_image = 4
num_images = 50000  # number of CIFAR-10 training images fed through the pipeline

# We'd like to do several operations on them, so we'll set up a pipeline to
# do so.
pipeline = preprocessing.Pipeline()

# First we want to pull out small patches of the images, since it's easier
# to train an RBM on these
pipeline.items.append(preprocessing.ExtractPatchesWithPosition(
    patch_shape=(patch_width, patch_width),
    patches_per_image=patches_per_image))

# Next we contrast normalize the patches. The default arguments use the
# same "regularization" parameters as those used in Adam Coates, Honglak
# Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in
# Unsupervised Feature Learning"
pipeline.items.append(preprocessing.GlobalContrastNormalization(
    sqrt_bias=10., use_std=True))

# Next we reduce the dimensionality of the patches with PCA, keeping at
# most num_components components.
pipeline.items.append(preprocessing.PCA(
    num_components=num_components, keep_var_fraction=keep_var_fraction))

# Finally we form pairs of patches drawn from the same image
pipeline.items.append(preprocessing.ExtractPatchPairs(
    patches_per_image=patches_per_image, num_images=num_images))
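
# A minimal sketch of actually running the patch-pair pipeline built above,
# following the same apply-then-save pattern as get_processed_dataset();
# the helper and its output filename are illustrative assumptions.
def _demo_apply_patch_pair_pipeline():
    trainset = cifar10.CIFAR10(which_set="train", start=0, stop=num_images)
    trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    serial.save('pp_cifar10_patch_pairs.pkl', trainset)
    return trainset
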
# We'll need the preprocessing module to preprocess the dataset
from pylearn2.datasets import preprocessing

if __name__ == "__main__":

    # Our raw training set is 32x32 color images
    train = cifar10.CIFAR10(which_set="train")

    # We'd like to do several operations on them, so we'll set up a pipeline
    # to do so.
    pipeline = preprocessing.Pipeline()

    # First we want to pull out small patches of the images, since it's
    # easier to train an RBM on these
    pipeline.items.append(preprocessing.ExtractPatchesWithPosition(
        patch_shape=(8, 8), patches_per_image=3))

    # Next we contrast normalize the patches. The default arguments use the
    # same "regularization" parameters as those used in Adam Coates, Honglak
    # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in
    # Unsupervised Feature Learning"
    pipeline.items.append(preprocessing.GlobalContrastNormalization(
        sqrt_bias=10., use_std=True))

    # Finally we reduce the dimensionality of the patches with PCA, keeping
    # enough components to retain 99% of the variance.
    pipeline.items.append(preprocessing.PCA(keep_var_fraction=.99))

    # Here we apply the preprocessing pipeline to the dataset. The can_fit
    # argument indicates that data-driven preprocessing steps (such as the
    # PCA step in this example) will be fit to this particular dataset.
    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
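
    # Sketch of the usual next step, mirroring get_processed_dataset() above:
    # pickle the preprocessed patches and record yaml_src so the weights can
    # be visualized after training. The filename is an illustrative
    # assumption.
    train_path = 'pp_cifar10_train_patches.pkl'
    train.yaml_src = '!pkl: "%s"' % train_path
    serial.save(train_path, train)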