Example #1
    # Excerpted from a pylearn2 test class; assumes these module imports:
    # from pylearn2.datasets import cifar10
    # from pylearn2.datasets.preprocessing import RemoveMean
    # from pylearn2.utils import isfinite
    def test_mean_keep_dimensions(self):
        data_set = cifar10.CIFAR10(which_set="train")
        pp = RemoveMean(axis=1)

        data_set.apply_preprocessor(pp, can_fit=True)
        result = data_set.get_design_matrix()

        assert isfinite(result)
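A minimal standalone sketch of what this test asserts, assuming numpy semantics for RemoveMean(axis=1) (each example's mean across its features is subtracted):

import numpy as np

# Hypothetical check of the RemoveMean(axis=1) behavior: centering each
# row leaves it with mean ~0, and the result stays finite.
X = np.random.randn(5, 12).astype('float32')
X_centered = X - X.mean(axis=1, keepdims=True)
assert np.allclose(X_centered.mean(axis=1), 0.0, atol=1e-5)
assert np.all(np.isfinite(X_centered))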
Example #2
File: run.py Project: capybaralet/current
def get_processed_dataset():

    # patch_shape, patches_per_image, num_components, keep_var_fraction,
    # train_size, input_width, start, stop, and new_params are module-level
    # settings defined elsewhere in run.py.
    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    if os.path.exists(train_path) and os.path.exists(
            test_path) and not new_params:
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)

    else:
        print 'loading raw data...'

        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape, patches_per_image=patches_per_image))
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))
        pipeline.items.append(
            preprocessing.PCA(num_components=num_components,
                              keep_var_fraction=keep_var_fraction))
        pipeline.items.append(
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

        # Pickling is having issues; the dataset may be too big.
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # this path will be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
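Note that in this snippet the pipeline is fit and applied to trainset only, while testset is pickled raw. If the test set is meant to go through the same preprocessing, the convention the other examples here follow would be a sketch like:

# Apply the already-fit pipeline to the test set; can_fit=False reuses
# the statistics learned from the training data.
testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)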
Example #3
def main():
    train = cifar10.CIFAR10(which_set="train", center=True)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True))
    pipeline.items.append(preprocessing.PCA(num_components=512))

    test = cifar10.CIFAR10(which_set="test")

    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

    serial.save('cifar10_preprocessed_train.pkl', train)
    serial.save('cifar10_preprocessed_test.pkl', test)
Example #4
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.
    """

    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and \
            os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()

        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000))

        pipeline.items.append(preprocessing.GlobalContrastNormalization())

        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # can_fit=False: reuse the statistics fit on the training set
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print 'saving preprocessed data...'
        serial.save('cifar10_preprocessed_train.pkl', trainset)
        serial.save('cifar10_preprocessed_test.pkl', testset)

        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    # this path will be used for visualizing weights after training is done
    #global YAML
    return trainset, testset
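A hedged usage sketch of the caching round-trip this function implements. use_design_loc stores the large design matrix in a separate .npy file next to the pickle, which keeps the .pkl itself small:

trainset, testset = get_dataset_cifar10()
X = trainset.get_design_matrix()  # backed by train_design.npy after a reload
print 'train design matrix:', X.shape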
Example #5
def get_dataset_cifar10():

    train_path = 'cifar10_train.pkl'
    test_path = 'cifar10_test.pkl'

    if os.path.exists(train_path) and \
            os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)

    else:
        print 'loading raw data...'
        trainset = cifar10.CIFAR10(which_set="train", one_hot=True)
        testset = cifar10.CIFAR10(which_set="test", one_hot=True)

        serial.save('cifar10_train.pkl', trainset)
        serial.save('cifar10_test.pkl', testset)

        # this path will be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
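A brief, hedged check of what one_hot=True does to the labels: y becomes an (N, 10) indicator matrix instead of a length-N integer vector.

trainset, testset = get_dataset_cifar10()
print trainset.y.shape  # expected (50000, 10) for one-hot CIFAR-10 train labels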
Example #6
def main():

    # Only the trainset is processed by this function.
    print 'getting preprocessed data to train model'
    pp_trainset, testset = get_processed_dataset()
    # remember to change here when changing datasets
    print 'loading unprocessed data for input displays'
    trainset = cifar10.CIFAR10(which_set="train")

    # nvis must match the width of the preprocessed data the model trains on
    dmat = pp_trainset.get_design_matrix()
    nvis = dmat.shape[1]

    model = DenoisingAutoencoder(
        corruptor=BinomialCorruptor(corruption_level=0.5),
        nhid=nhid,
        nvis=nvis,
        act_enc='sigmoid',
        act_dec='sigmoid',
        irange=.01)

    algorithm = SGD(
        learning_rate=0.1,
        cost=MeanSquaredReconstructionError(),
        batch_size=1000,
        monitoring_batches=10,
        monitoring_dataset=pp_trainset,
        termination_criterion=EpochCounter(max_epochs=MAX_EPOCHS_UNSUPERVISED),
        update_callbacks=None)

    extensions = None

    trainer = Train(model=model,
                    algorithm=algorithm,
                    save_path='testrun.pkl',
                    save_freq=1,
                    extensions=extensions,
                    dataset=pp_trainset)

    trainer.main_loop()
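Since Train checkpoints to save_path='testrun.pkl' above, a hedged follow-up sketch for inspecting the trained model afterwards:

from pylearn2.utils import serial

model = serial.load('testrun.pkl')
weights = model.get_weights()  # assumes the autoencoder exposes get_weights()
print 'weight matrix shape:', weights.shape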
Example #7
#
# This is also a common use case because often you will want to preprocess
# your data once and then train several models on the preprocessed data.

# We'll need the serial module to save the dataset
from pylearn2.utils import serial

# Our raw dataset will be the CIFAR10 image dataset
from pylearn2.datasets import cifar10

# We'll need the preprocessing module to preprocess the dataset
from pylearn2.datasets import preprocessing

if __name__ == "__main__":
    # Our raw training set is 32x32 color images
    train = cifar10.CIFAR10(which_set="train")

    # We'd like to do several operations on them, so we'll set up a pipeline to
    # do so.
    pipeline = preprocessing.Pipeline()

    # First we want to pull out small patches of the images, since it's easier
    # to train an RBM on these
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000)
    )

    # Next we contrast normalize the patches. The default arguments use the
    # same "regularization" parameters as those used in Adam Coates, Honglak
    # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in
    # Unsupervised Feature Learning"
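The snippet is truncated here; the step the comment describes presumably mirrors pylearn2's grbm_smd make_dataset.py, roughly:

    # Sketch of the truncated continuation (parameter values as in
    # pylearn2/scripts/tutorials/grbm_smd/make_dataset.py):
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True)
    )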
Example #8
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train")

pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2000000))
pipeline.items.append(preprocessing.GlobalContrastNormalization())
pipeline.items.append(preprocessing.ZCA())

test = cifar10.CIFAR10(which_set="test")

train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M_design.npy')
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M_design.npy')

serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M.pkl', train)
serial.save('/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M.pkl', test)
Example #9
# end def lcn_2d

if __name__ == '__main__':
    from pylearn2.datasets import cifar10
    import matplotlib.pylab as plt
    from classification import load_initial_data
    from fileop import loadfile
    import copy

    flag_cifar10 = False
    flag_covmat = False

    if flag_cifar10:
        img_shape = (32, 32, 3)
        train = cifar10.CIFAR10(which_set="train", one_hot=True)
        test = cifar10.CIFAR10(which_set="test", one_hot=True)
        X = train.X
        X_test = test.X

    else:
        # use moth data for test
        img_shape = (28, 28, 3)
        config = loadfile('config.yaml')
        X, _, X_test, _ = \
            load_initial_data(data_path=config['data_path'],
                              target_width=config['target_width'],
                              target_height=config['target_height'],
                              flag_rescale=config['flag_rescale'],
                              flag_multiscale=config['flag_multiscale'],
                              detect_width_list=config['detect_width_list'],
Example #10
from kaggle_dataset import kaggle_cifar10
# cifar10 and maxout imports added; both are used below
from pylearn2.datasets import cifar10
from pylearn2.datasets.preprocessing import Pipeline, ZCA
from pylearn2.datasets.preprocessing import GlobalContrastNormalization
from pylearn2.models import maxout
from pylearn2.space import Conv2DSpace
from pylearn2.train import Train
from pylearn2.train_extensions import best_params, window_flip
from pylearn2.utils import serial

trn = kaggle_cifar10('train',
                     one_hot=True,
                     datapath='/home/kkastner/kaggle_data/kaggle-cifar10',
                     max_count=40000,
                     axes=('c', 0, 1, 'b'))

tst = cifar10.CIFAR10('test',
                      toronto_prepro=False,
                      one_hot=True,
                      axes=('c', 0, 1, 'b'))

in_space = Conv2DSpace(shape=(32, 32),
                       num_channels=3,
                       axes=('c', 0, 1, 'b'))

l1 = maxout.MaxoutConvC01B(layer_name='l1',
                           pad=4,
                           tied_b=1,
                           W_lr_scale=.05,
                           b_lr_scale=.05,
                           num_channels=96,
                           num_pieces=2,
                           kernel_shape=(8, 8),
                           pool_shape=(4, 4),
Example #11
# Replicate the preprocessing described in Kai Yu's paper "Improving LCC with Local Tangents"
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train", center=True)

pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                              sqrt_bias=0.0,
                                              use_std=True))
pipeline.items.append(preprocessing.PCA(num_components=512))

test = cifar10.CIFAR10(which_set="test")

train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
Example #12
import numpy as np

from pylearn2.utils import serial
from pylearn2.utils import string_utils
from pylearn2.datasets import preprocessing
from pylearn2.datasets import cifar10
import pylearn2.pca as pca

output_dir = string_utils.preprocess(
    '/u/kruegers/repo/current/pylearn2/pylearn2/datasets/cifar10')

print "Preparing output directory..."
serial.mkdir(output_dir)

print 'Loading CIFAR-10 train and test datasets...'
trainset = cifar10.CIFAR10(which_set='train')
testset = cifar10.CIFAR10(which_set='test')

print "Learning the preprocessor"
preprocessor = pca.PCA()

print "Preprocessing the unsupervised train data..."
trainset.apply_preprocessor(preprocessor=preprocessor, can_fit=True)
print 'Saving the unsupervised train data'
trainset.use_design_loc(output_dir + '/train.npy')
serial.save(output_dir + '/train.pkl', trainset)

print "Preprocessing the test data..."
testset.apply_preprocessor(preprocessor=preprocessor, can_fit=False)
print "Saving the test data"
testset.use_design_loc(output_dir + '/test.npy')
Example #13
File: run.py Project: capybaralet/current
def main():

    # Only the trainset is processed by this function.
    print 'getting preprocessed data for training model'
    pp_trainset, testset = get_processed_dataset()
    # remember to change here when changing datasets
    print 'loading unprocessed data for input displays'
    trainset = cifar10.CIFAR10(which_set="train")

    dmat = pp_trainset.get_design_matrix()
    nvis = dmat.shape[1]

    model = DenoisingAutoencoder(
        corruptor=BinomialCorruptor(corruption_level=0.3),
        nhid=nhid,
        nvis=nvis,
        act_enc='sigmoid',
        act_dec='sigmoid',
        irange=.01)

    algorithm = SGD(
        learning_rate=learning_rate,
        cost=MeanSquaredReconstructionError(),
        batch_size=100,
        monitoring_batches=10,
        monitoring_dataset=pp_trainset,
        termination_criterion=EpochCounter(max_epochs=MAX_EPOCHS_UNSUPERVISED),
        update_callbacks=None)

    extensions = None

    trainer = Train(model=model,
                    algorithm=algorithm,
                    save_path='run.pkl',
                    save_freq=1,
                    extensions=extensions,
                    dataset=pp_trainset)

    trainer.main_loop()

    ####################
    # Plot and Save:

    # choose random patch-pairs to plot
    stamps = pp_trainset.stamps
    num_examples = stamps.shape[0]
    to_plot = np.random.randint(0, num_examples, num2plot)

    # use to_plot indices to extract data
    stamps_data = stamps[to_plot]
    image_numbers = stamps[to_plot, 0].astype(int)
    X = trainset.X
    images_data = trainset.get_topological_view(X[image_numbers])
    p1x = stamps_data[:, 1]
    p1y = stamps_data[:, 2]
    p2x = stamps_data[:, 3]
    p2y = stamps_data[:, 4]

    # For the input patch-pair displays: once the patches are identified,
    # outline them and draw an arrow for the displacement d.
    # This may modify the original trainset; if so, work on a copy instead.
    add_outlines(images_data, p1x, p1y, patch_width)
    add_outlines(images_data, p2x, p2y, patch_width)

    ##################################################
    # translating outputs back into things we can plot
    dataset = pp_trainset
    Xout = dataset.X.astype('float32')
    max_stamp = input_width - patch_width
    d_size = (2 * max_stamp + 1)**input_dim
    # displacement
    d_enc = Xout[:, -d_size:]
    d_out_flat = np.argmax(d_enc, axis=1)
    d_shape = [2 * max_stamp + 1, 2 * max_stamp + 1]  # assumed 2D
    d_out = flat_to_2D(d_out_flat, d_shape)
    # patches
    vc = dataset.view_converter
    p_enc = Xout[:, :len(Xout.T) - d_size]
    p_size = p_enc.shape[1] // 2  # split evenly into the two patch encodings
    p1_enc = p_enc[:, :p_size]
    p2_enc = p_enc[:, p_size:]
    p1_enc = vc.design_mat_to_topo_view(p1_enc)
    p2_enc = vc.design_mat_to_topo_view(p2_enc)
    pp = dataset.preprocessor
    gcn = pp.items[1]
    means = gcn.means
    normalizers = gcn.normalizers
    toshape = (num_examples, )
    for i in range(input_dim):
        toshape += (1, )
    if num_channels != 1:
        toshape += (1, )
    # When the number of patches and patch-pairs differs, this breaks:
    # normalizers/means need to be matched up with their corresponding patches.
    # Undoing the PCA may be broken too, but it fails without errors...
    normalizers1 = expand_p1(normalizers)
    normalizers2 = expand_p2(normalizers)
    means1 = expand_p1(means)
    means2 = expand_p2(means)

    p1_enc *= normalizers1.reshape(toshape)
    p1_enc += means1.reshape(toshape)
    p2_enc *= normalizers2.reshape(toshape)
    p2_enc += means2.reshape(toshape)
    # Now, we pull off the same examples from the data to compare to dAE inputs in plots
    outputs = copy.deepcopy(images_data)
    insertpatches(outputs, p1_enc[to_plot], p1x, p1y, patch_width)
    insertpatches(outputs, p2_enc[to_plot], p2x, p2y, patch_width)

    plt.figure()

    for i in range(num2plot):
        # Inputs
        plt.subplot(num2plot, 2, 2 * i + 1)
        plt.imshow(images_data[i], cmap=cm.Greys_r)
        print stamps_data[i]
        a = (stamps_data[i, 2] + patch_width / 2,
             stamps_data[i, 1] + patch_width / 2, stamps_data[i, 6],
             stamps_data[i, 5])
        plt.arrow(a[0], a[1], a[2], a[3], head_width=1.0, head_length=0.6)
        # Outputs
        plt.subplot(num2plot, 2, 2 * i + 2)
        plt.imshow(outputs[i], cmap=cm.Greys_r)
        plt.arrow(a[0],
                  a[1],
                  d_out[to_plot[i], 1],
                  d_out[to_plot[i], 0],
                  head_width=1.0,
                  head_length=0.6)

    savestr = 'cifar_ppd.png'
    plt.savefig(savestr)  # save before show(): some backends clear the figure when the window closes
    plt.show()