예제 #1
0
def main():
    """Build the STL-10 8x8 patch dataset and save it with its pipeline.

    Loads the downsampled (32x32) unlabeled and train STL-10 pickles from
    ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32``, concatenates their design
    matrices, extracts 2 million 8x8 patches, applies global contrast
    normalization followed by ZCA, and writes ``README``, ``data.npy``,
    ``data.pkl`` and ``preprocessor.pkl`` into ``stl10_patches_8x8``.
    """
    # NOTE(review): `string` is presumably pylearn2's string_utils imported
    # under that name (the stdlib `string` module has no `preprocess`) --
    # confirm against the file's imports.
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL10-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    # The train split has been merged into `data`; drop the extra reference.
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)

    # The README must describe what is actually produced below: 8x8 patches.
    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)

    # The context manager closes the file even if the write fails.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8),
                          num_patches=2*1000*1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    # can_fit=True: this is the data the pipeline is fitted on.
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Store the design matrix in a separate .npy file next to the pickle.
    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
예제 #2
0
def main():
    """Build the CIFAR-100 6x6 patch dataset and save it with its pipeline.

    Loads the CIFAR-100 train set, extracts 2 million 6x6 patches, applies
    global contrast normalization followed by ZCA, and writes ``README``,
    ``data.npy``, ``data.pkl`` and ``preprocessor.pkl`` into
    ``${PYLEARN2_DATA_PATH}/cifar100/cifar100_patches``.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)

    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)

    # The context manager closes the file even if the write fails.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    # can_fit=True: this is the data the pipeline is fitted on.
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Store the design matrix in a separate .npy file next to the pickle.
    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
예제 #3
0
 def get_preprocess_gcn(self, preprocess_id):
     """Build a GlobalContrastNormalization from its stored database row.

     Reads ``subtract_mean``, ``std_bias`` and ``use_norm`` from
     ``hps3.preprocess_gcn`` for the given ``preprocess_id`` and passes
     them unchanged to ``pp.GlobalContrastNormalization``.

     Raises
     ------
     HPSData
         If the query returns no row (or an empty one).
     """
     query = """
     SELECT subtract_mean, std_bias, use_norm
     FROM hps3.preprocess_gcn
     WHERE preprocess_id = %s
     """
     params = self.db.executeSQL(query, (preprocess_id, ), self.db.FETCH_ONE)
     # A missing row (None) and an empty row are both treated as "not found".
     if not params:
         message = "No gcn preprocess for preprocess_id=" + str(preprocess_id)
         raise HPSData(message)
     subtract_mean, std_bias, use_norm = params
     return pp.GlobalContrastNormalization(
         subtract_mean=subtract_mean,
         std_bias=std_bias,
         use_norm=use_norm,
     )
예제 #4
0
def get_pipeline(img_shape, patch_size, batch_size):
    """Assemble the preprocessing pipeline selected by the configuration.

    The ``'preprocessing'`` section of ``get_config()`` is consulted for
    three switches; each enabled one appends a step, in this order:
    ``remove_mean`` -> RemoveMean, ``gcn`` -> GlobalContrastNormalization
    (with ``batch_size``), ``lcn`` -> LeCunLCN (with ``img_shape``).

    Returns the (possibly empty) ``preprocessing.Pipeline``.
    """
    pipeline = preprocessing.Pipeline()
    conf = get_config()
    steps = pipeline.items

    if conf['preprocessing']['remove_mean']:
        steps.append(preprocessing.RemoveMean())

    if conf['preprocessing']['gcn']:
        gcn = preprocessing.GlobalContrastNormalization(batch_size=batch_size)
        steps.append(gcn)

    if conf['preprocessing']['lcn']:
        # LCN needs an odd kernel size: even sizes are bumped up by one,
        # odd sizes are kept as they are.
        lcn_patch_size = patch_size + 1 - (patch_size % 2)
        lcn = preprocessing.LeCunLCN(img_shape, kernel_size=lcn_patch_size)
        steps.append(lcn)

    return pipeline
예제 #5
0
def main():
    """Preprocess CIFAR-10 with GCN + PCA(512) and pickle both splits.

    The pipeline is fitted on the (centered) train split and then applied
    unchanged to the test split. Results are written to
    ``cifar10_preprocessed_train.pkl`` and ``cifar10_preprocessed_test.pkl``
    in the current working directory.
    """
    train_set = cifar10.CIFAR10(which_set="train", center=True)

    # Contrast normalization without mean subtraction, then PCA down to
    # 512 components.
    gcn = preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                    sqrt_bias=0.0,
                                                    use_std=True)
    pca = preprocessing.PCA(num_components=512)

    pipeline = preprocessing.Pipeline()
    for step in (gcn, pca):
        pipeline.items.append(step)

    test_set = cifar10.CIFAR10(which_set="test")

    # Fit on train only; the test split reuses the fitted transformation.
    train_set.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    test_set.apply_preprocessor(preprocessor=pipeline, can_fit=False)

    serial.save('cifar10_preprocessed_train.pkl', train_set)
    serial.save('cifar10_preprocessed_test.pkl', test_set)
예제 #6
0
파일: run.py 프로젝트: capybaralet/current
def get_processed_dataset():
    """Return ``(trainset, testset)`` for CIFAR-10, using pickles if present.

    When both ``pp_cifar10_train.pkl`` and ``pp_cifar10_test.pkl`` exist and
    the module-level ``new_params`` flag is falsy, the datasets are loaded
    from those pickles. Otherwise the train split is built from raw data and
    run through a patch-extraction / GCN / PCA / patch-pair pipeline
    configured by module-level settings (``patch_shape``,
    ``patches_per_image``, ``num_components``, ``keep_var_fraction``,
    ``train_size``, ``input_width``, ``start``, ``stop``), and both datasets
    are pickled.

    Note that only the train split goes through the pipeline here; the test
    split is saved and returned as loaded.
    """
    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    have_cache = (os.path.exists(train_path)
                  and os.path.exists(test_path)
                  and not new_params)
    if have_cache:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
        return trainset, testset

    print('loading raw data...')

    # Steps in application order.
    extract = preprocessing.ExtractPatchesWithPosition(
        patch_shape=patch_shape, patches_per_image=patches_per_image)
    gcn = preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                    use_std=True)
    pca = preprocessing.PCA(num_components=num_components,
                            keep_var_fraction=keep_var_fraction)
    pairs = preprocessing.ExtractPatchPairs(
        patches_per_image=patches_per_image,
        num_images=train_size,
        input_width=input_width)

    pipeline = preprocessing.Pipeline()
    for step in (extract, gcn, pca, pairs):
        pipeline.items.append(step)

    trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
    testset = cifar10.CIFAR10(which_set="test")

    # Keep a reference to the pipeline on the dataset before applying it.
    trainset.preprocessor = pipeline
    trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Original author's note: pickling has had issues here, possibly
    # because the dataset is too big.
    serial.save(train_path, trainset)
    serial.save(test_path, testset)

    # These paths are used for visualizing weights after training. They are
    # set after saving, so the pickles themselves do not carry them.
    trainset.yaml_src = '!pkl: "%s"' % train_path
    testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
def get_dataset_cifar10():
    """
    Return ``(trainset, testset)`` for CIFAR-10, preprocessed with the
    original pylearn2 example pipeline. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.

    If both ``cifar10_preprocessed_train.pkl`` and
    ``cifar10_preprocessed_test.pkl`` exist in the working directory they
    are loaded as-is. Otherwise the raw data is loaded, 150000 8x8 patches
    are extracted, global contrast normalization and ZCA are applied, and
    the results are saved (design matrices in ``train_design.npy`` /
    ``test_design.npy``).

    The pipeline is fitted on the train split only and reused unchanged on
    the test split.
    """

    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and os.path.exists(test_path):
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()

        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000))

        pipeline.items.append(preprocessing.GlobalContrastNormalization())

        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # can_fit=False: the test split must reuse the transformation
        # fitted on the train split instead of refitting on test data.
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # These paths are used for visualizing weights after training. They
        # are set after saving, so the pickles do not carry them.
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
예제 #8
0
파일: gwdata.py 프로젝트: gaoch023/kaggle
def generate_patches():
    """Preprocess the GenderWrite splits with GCN + ZCA and save them.

    Four datasets are built in this order: ``train`` (rows 1-201 of the
    training set), ``valid`` (rows 201-283), ``test`` and ``tottrain`` (the
    whole training set). The pipeline is allowed to fit on ``train`` and on
    ``tottrain``; ``valid`` and ``test`` reuse whatever was fitted before
    them. Each dataset's design matrix goes to
    ``DATA_DIR + <name> + '_design.npy'`` and the dataset itself to
    ``DATA_DIR + 'gw_preprocessed_' + <name> + '.pkl'``.
    """
    GWData = GenderWrite.gwdata.GWData
    datasets = OrderedDict([
        ('train', GWData(which_set='train', start=1, stop=201)),
        ('valid', GWData(which_set='train', start=201, stop=283)),
        ('test', GWData(which_set='test')),
        ('tottrain', GWData(which_set='train')),
    ])

    # Preprocess patches: contrast normalization followed by whitening.
    pipeline = preprocessing.Pipeline()
    for step in (preprocessing.GlobalContrastNormalization(),
                 preprocessing.ZCA()):
        pipeline.items.append(step)

    fit_splits = ('train', 'tottrain')
    for name in datasets:
        split = datasets[name]
        print(name)
        # Only the training splits may (re)fit the pipeline.
        split.apply_preprocessor(preprocessor=pipeline,
                                 can_fit=name in fit_splits)
        # Save: design matrix as .npy, dataset object as pickle.
        split.use_design_loc(DATA_DIR + name + '_design.npy')
        serial.save(DATA_DIR + 'gw_preprocessed_' + name + '.pkl', split)
예제 #9
0
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Load both CIFAR-10 splits.
train = cifar10.CIFAR10(which_set="train")
test = cifar10.CIFAR10(which_set="test")

# Pipeline: 2,000,000 8x8 patches -> contrast normalization -> ZCA.
pipeline = preprocessing.Pipeline()
for step in (
        preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2000000),
        preprocessing.GlobalContrastNormalization(),
        preprocessing.ZCA()):
    pipeline.items.append(step)

# Fit on the train split only; the test split reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Keep the design matrices in separate .npy files next to the pickles.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M_design.npy'
)
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M_design.npy'
)

serial.save(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_train_2M.pkl', train
)
serial.save(
    '/data/lisatmp/goodfeli/cifar10_preprocessed_test_2M.pkl', test
)
예제 #10
0
#replicate the preprocessing described in Kai Yu's paper Improving LCC with Local Tangents
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Load both CIFAR-10 splits; only the train split is loaded with center=True.
train = cifar10.CIFAR10(which_set="train", center=True)
test = cifar10.CIFAR10(which_set="test")

# Pipeline: contrast normalization (no mean subtraction, zero std bias)
# followed by PCA down to 512 components.
pipeline = preprocessing.Pipeline()
for step in (
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  std_bias=0.0),
        preprocessing.PCA(num_components=512)):
    pipeline.items.append(step)

# Fit on the train split only; the test split reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Pickles are written to the current working directory.
serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
예제 #11
0
def generate(opc):
    """
    Summary (Generates a dataset with the chosen transformation).

    Builds 20000 pairs of 13x13 single-channel images cut from random
    19x19 images. The first image of each pair is the central crop; the
    second is the same crop after a random shift or rotation. The target
    is the index of the transformation that was applied. Both images are
    contrast-normalized together and the result is pickled as
    ``train_preprocessed.pkl`` next to this file.

    Parameters
    ----------
    opc: string
        Only two options, shifts or rotations.
    """
    dim = 19  # outer square
    # A bigger image is used to avoid empty pixels in the
    # borders.
    reg = 13  # inner square
    total = 20000  # Number of training examples

    # Bounds of the central reg x reg crop inside the dim x dim image
    # (3 and 16 for the sizes above).
    lo = (dim - reg) // 2
    hi = lo + reg

    im1 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    im2 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    Y = numpy.zeros((total, 1), dtype='uint8')
    rng = make_np_rng(9001, [1, 2, 3], which_method="uniform")

    if opc == 'shifts':
        # Only shifts between [-3, +3] pixels in each direction, so the
        # shifted crop always stays inside the outer square.
        shifts = list(itertools.product(range(-lo, lo + 1),
                                        range(-lo, lo + 1)))
        for t in range(total):
            x = numpy.ceil(rng.uniform(0, 1, (dim, dim)) * 255)
            ind = rng.randint(0, len(shifts))
            Y[t] = ind
            tx, ty = shifts[ind]
            im1[t, :] = x[lo:hi, lo:hi][:, :, None]
            im2[t, :] = x[(lo + tx):(hi + tx), (lo + ty):(hi + ty)][:, :, None]
    else:
        assert opc == 'rotations'
        # Prefer the Pillow package layout; fall back to the legacy
        # top-level PIL module used by the original code.
        try:
            from PIL import Image
        except ImportError:
            import Image
        angs = numpy.linspace(0, 359, 90)
        for t in range(total):
            x = numpy.ceil(rng.uniform(0, 1, (dim, dim)) * 255)
            ind = rng.randint(0, len(angs))
            Y[t] = ind
            # Rotate the full outer image, then take the central crop.
            y = numpy.asarray(Image.fromarray(x).rotate(angs[ind]))
            im1[t, :] = x[lo:hi, lo:hi][:, :, None]
            im2[t, :] = y[lo:hi, lo:hi][:, :, None]

    view_converter = dense_design_matrix.DefaultViewConverter((reg, reg, 1))

    design_X = view_converter.topo_view_to_design_mat(im1)
    design_Y = view_converter.topo_view_to_design_mat(im2)

    # Normalize data: both image sets go through the same GCN step as one
    # stacked design matrix, then are split apart again.
    pipeline = preprocessing.Pipeline()
    gcn = preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                    use_std=True)
    pipeline.items.append(gcn)
    XY = numpy.concatenate((design_X, design_Y), 0)
    XY_ImP = dense_design_matrix.DenseDesignMatrix(X=XY)
    XY_ImP.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    n_first = design_X.shape[0]
    X1 = XY_ImP.X[0:n_first, :]
    X2 = XY_ImP.X[n_first:, :]

    # As a Conv2DSpace
    topo_X1 = view_converter.design_mat_to_topo_view(X1)
    topo_X2 = view_converter.design_mat_to_topo_view(X2)
    axes = ('b', 0, 1, 'c')
    data_specs = (CompositeSpace([
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        VectorSpace(1)
    ]), ('featuresX', 'featuresY', 'targets'))
    train = VectorSpacesDataset((topo_X1, topo_X2, Y), data_specs=data_specs)

    import os

    save_path = os.path.dirname(os.path.realpath(__file__))
    serial.save(os.path.join(save_path, 'train_preprocessed.pkl'), train)


def check_dtype(data):
    """Align ``config.floatX`` with the dtype of ``data.X``.

    If the dtype name of ``data.X`` differs from ``config.floatX``, a
    warning is logged and ``config.floatX`` is overwritten with that dtype
    name. Otherwise nothing happens.

    NOTE(review): the ``def`` line of this helper was missing, leaving its
    body attached to the end of ``generate``. The name and parameter are
    reconstructed from the body (which reads ``data``) and from the
    ``check_dtype(...)`` calls elsewhere in this file -- confirm. ``config``
    is presumably theano's config object, per the log message.
    """
    if str(data.X.dtype) != config.floatX:
        logging.warning("The dataset is saved as {}, changing theano's floatX " \
                        "to the same dtype".format(data.X.dtype))
        config.floatX = str(data.X.dtype)


# Training split: the only split the pipeline is allowed to fit on.
train = SVHN('splitted_train', path=local_path)
check_dtype(train)

# Preprocessing steps, in application order. GCN is given a batch_size
# because, per the original author, running without one is likely to end
# in a memory error or a pytables crash.
pipeline = preprocessing.Pipeline()
for step in (preprocessing.GlobalContrastNormalization(batch_size=5000),
             preprocessing.LeCunLCN((32, 32))):
    pipeline.items.append(step)

# Fit and apply on train, then drop the reference to it.
train.apply_preprocessor(pipeline, can_fit=True)
del train

# Validation split: reuse the fitted pipeline, no refitting.
valid = SVHN('valid', path=local_path)
check_dtype(valid)
valid.apply_preprocessor(pipeline, can_fit=False)

# Test split: same treatment as validation.
test = SVHN('test', path=local_path)
check_dtype(test)
test.apply_preprocessor(pipeline, can_fit=False)
예제 #13
0
def get_dataset(tot=False, preprocessor='normal'):
    """Return the preprocessed digit datasets, building them if needed.

    Parameters
    ----------
    tot : bool
        If True, return the full training set (``tottrain``) in place of
        the 34000-row ``train`` split.
    preprocessor : str
        Names the variant and is part of every cache file name.
        ``'normal'`` applies GCN + ZCA; ``'nozca'`` applies GCN only. Any
        other value applies GCN + ZCA and then rewrites every row through
        a 28x28 image round trip, with an extra transformation for
        ``'rotate'``, ``'emboss'``, ``'hshear'``, ``'vshear'``, ``'patch'``
        and ``'noisy'``.

    Returns
    -------
    tuple
        ``(tottrainset, validset, testset)`` if ``tot`` else
        ``(trainset, validset, testset)``.
    """
    raw_files = ('train.npy', 'test.npy', 'targets.npy')
    if not all(os.path.exists(DATA_DIR + name) for name in raw_files):
        initial_read()

    train_path = DATA_DIR + 'train_' + preprocessor + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid_' + preprocessor + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain_' + preprocessor + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test_' + preprocessor + '_preprocessed.pkl'

    # The cache is usable only if every pickle that will be loaded exists;
    # that includes the tottrain pickle when the caller asks for it.
    needed = [train_path, valid_path, test_path]
    if tot:
        needed.append(tottrain_path)

    if all(os.path.exists(path) for path in needed):

        print('loading preprocessed data')
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:

        print('loading raw data...')
        trainset = Digits(which_set='train', start=0, stop=34000)
        validset = Digits(which_set='train', start=34000, stop=42000)
        tottrainset = Digits(which_set='train')
        testset = Digits(which_set='test')

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))

        if preprocessor != 'nozca':
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original
            # image better
            pipeline.items.append(preprocessing.ZCA())

        # note the can_fit=False's: no sharing between train and valid data.
        # tottrain refits the pipeline, so test is transformed with the
        # tottrain fit.
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)

        if preprocessor not in ('normal', 'nozca'):
            for data in (trainset, validset, tottrainset, testset):
                for ii in range(data.X.shape[0]):
                    # normalize to [0,1]; a constant row has no range to
                    # scale by, so map it to zeros instead of dividing by 0
                    dmax = np.max(data.X[ii, :])
                    dmin = np.min(data.X[ii, :])
                    span = dmax - dmin
                    if span == 0:
                        dnorm = np.zeros_like(data.X[ii, :])
                    else:
                        dnorm = (data.X[ii, :] - dmin) / span
                    # and convert to PIL image
                    img = Image.fromarray(
                        dnorm.reshape(28, 28) * 255.).convert('L')

                    # apply preprocessor
                    if preprocessor == 'rotate':
                        rot = rng.randint(-40, 41)
                        img = img.rotate(rot, Image.BILINEAR)
                    elif preprocessor == 'emboss':
                        img = emboss(img)
                    elif preprocessor == 'hshear':
                        # coef = 0 means unsheared
                        coef = -1 + np.random.rand()*2
                        # note: image is moved with (coef/2)*28 to center
                        # it after shearing
                        img = img.transform(
                            (28, 28), Image.AFFINE,
                            (1, coef, -(coef/2)*28, 0, 1, 0),
                            Image.BILINEAR)
                    elif preprocessor == 'vshear':
                        coef = -1 + np.random.rand()*2
                        img = img.transform(
                            (28, 28), Image.AFFINE,
                            (1, 0, 0, coef, 1, -(coef/2)*28),
                            Image.BILINEAR)
                    elif preprocessor == 'patch':
                        # negative values are not possible in PIL, so do a
                        # zoom only transform then
                        x1 = np.random.randint(0, 5)
                        y1 = np.random.randint(0, 5)
                        x2 = np.random.randint(0, 5)
                        y2 = np.random.randint(0, 5)
                        img = img.transform(
                            (28, 28), Image.EXTENT,
                            (x1, y1, 28-x2, 28-y2),
                            Image.BILINEAR)

                    # convert back to numpy array
                    data.X[ii, :] = np.array(img.getdata()) / 255.

                    if preprocessor == 'noisy':
                        # add noise
                        data.X[ii, :] += np.random.randn(28*28) * 0.1
                        # bound between [0,1]
                        data.X[ii, :] = np.clip(data.X[ii, :], 0., 1.)

        # this uses numpy format for storage instead of pickle, for memory
        # reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + preprocessor + '_design.npy')
        validset.use_design_loc(DATA_DIR + 'valid_' + preprocessor + '_design.npy')
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + preprocessor + '_design.npy')
        testset.use_design_loc(DATA_DIR + 'test_' + preprocessor + '_design.npy')
        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
예제 #14
0
def test_works():
    """Train a convolutional keypoint model on the pickled datasets.

    With the hard-coded ``load = True``, the train/validation datasets are
    read from ``kpd.pkl`` and an MLP (two ConvRectifiedLinear layers, one
    RectifiedLinear layer and a MultiSoftmax output) is trained with
    KeypointSGD; the model is saved to ``kpd_model2.pkl`` and the best one
    (by ``validation_objective``) to ``kpd_best.pkl``.

    With ``load = False`` the function only builds ``kpd.pkl``: it loads
    the FacialKeypoint splits, applies Standardize and
    GlobalContrastNormalization (fitted on train only), pickles the pair
    and returns without training.
    """
    load = True
    if not load:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)
        # Each preprocessor is fitted on train and reused as-is on valid.
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        stndrdz.apply(ddmValid, can_fit=False)
        GCN = preprocessing.GlobalContrastNormalization()
        GCN.apply(ddmTrain, can_fit=True)
        GCN.apply(ddmValid, can_fit=False)

        with open('kpd.pkl', 'wb') as pcklFile:
            pickle.dump((ddmTrain, ddmValid), pcklFile)
        return

    # NOTE: pickle.load executes arbitrary code from the file; kpd.pkl must
    # be a file produced by this function, never untrusted input.
    with open('kpd.pkl', 'rb') as pcklFile:
        (ddmTrain, ddmValid) = pickle.load(pcklFile)

    #creating layers
    #2 convolutional rectified layers, border mode valid
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Rectified linear units
    layer3 = RectifiedLinear(dim=3000, sparse_init=15, layer_name='RectLin3')

    #multisoftmax
    n_groups = 30
    n_classes = 98
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups,
                           irange=0.05,
                           n_classes=n_classes,
                           layer_name=layer_name)

    #setting up MLP
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    #mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    #algorithm

    # learning rate, momentum, batch size, monitoring dataset, cost,
    # termination criteria

    term_crit = MonitorBased(prop_decrease=0.00001,
                             N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={
                            'validation': ddmValid,
                            'training': ddmTrain
                        },
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    #train extension
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998,
                                          min_lr_scale=0.01)
    #train object
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[
                      train_ext,
                      MonitorBasedSaveBest(channel_name='validation_objective',
                                           save_path='kpd_best.pkl'),
                      MomentumAdjustor(start=1, saturate=20, final_momentum=.9)
                  ])
    train.main_loop()
    train.save()
예제 #15
0
    # We'd like to do several operations on them, so we'll set up a pipeline to
    # do so. (`train`, the dataset being preprocessed, is created before this
    # fragment.)
    pipeline = preprocessing.Pipeline()

    # First we want to pull out small patches of the images, since it's easier
    # to train an RBM on these
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000)
    )

    # Next we contrast normalize the patches. sqrt_bias and use_std are passed
    # explicitly here rather than left at their defaults.
    # NOTE(review): the original comment said the "regularization" parameters
    # match those used in Adam Coates, Honglak Lee, and Andrew Ng's paper
    # "An Analysis of Single-Layer Networks in Unsupervised Feature Learning"
    # -- confirm that these explicit values are the ones meant.
    pipeline.items.append(preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))

    # Finally we whiten the data using ZCA. Again, the default parameters to
    # ZCA are set to the same values as those used in the previously mentioned
    # paper.
    pipeline.items.append(preprocessing.ZCA())

    # Here we apply the preprocessing pipeline to the dataset. The can_fit
    # argument indicates that data-driven preprocessing steps (such as the ZCA
    # step in this example) are allowed to fit themselves to this dataset.
    # Later we might want to run the same pipeline on the test set with the
    # can_fit flag set to False, in order to make sure that the same whitening
    # matrix was used on both datasets.
    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Finally we save the dataset to the filesystem. We instruct the dataset to
예제 #16
0
#replicate the preprocessing described in Kai Yu's paper Improving LCC with Local Tangents
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Load both CIFAR-10 splits; only the train split is loaded with center=True.
train = cifar10.CIFAR10(which_set="train", center=True)
test = cifar10.CIFAR10(which_set="test")

# Pipeline: contrast normalization (no mean subtraction, zero sqrt bias,
# std-based) followed by PCA down to 512 components.
pipeline = preprocessing.Pipeline()
for step in (
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True),
        preprocessing.PCA(num_components=512)):
    pipeline.items.append(step)

# Fit on the train split only; the test split reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Pickles are written to the current working directory.
serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
예제 #17
0
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

# Load both CIFAR-10 splits.
train = cifar10.CIFAR10(which_set="train")
test = cifar10.CIFAR10(which_set="test")

# Pipeline: 2,000,000 8x8 patches, then contrast normalization with
# std_bias=0.0 and use_norm=1. (no whitening step in this variant).
pipeline = preprocessing.Pipeline()
for step in (
        preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2000000),
        preprocessing.GlobalContrastNormalization(std_bias=0.0, use_norm=1.)):
    pipeline.items.append(step)

# Fit on the train split only; the test split reuses the fitted pipeline.
train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

# Keep the design matrices in separate .npy files next to the pickles.
train.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_sphere_train_2M_design.npy'
)
test.use_design_loc(
    '/data/lisatmp/goodfeli/cifar10_sphere_test_2M_design.npy'
)

serial.save('/data/lisatmp/goodfeli/cifar10_sphere_train_2M.pkl', train)
serial.save('/data/lisatmp/goodfeli/cifar10_sphere_test_2M.pkl', test)

# Read the train pickle back in from disk.
train = serial.load('/data/lisatmp/goodfeli/cifar10_sphere_train_2M.pkl')
to extract the patches and approximately whiten / contrast normalize
them. This object is necessary when extracting features for
supervised learning or test set classification, because the
extracted features must be computed using inputs that have been
whitened with the ZCA matrix learned and stored by this Pipeline.

They were created with the pylearn2 script make_cifar100_patches.py.

All other files in this directory, including this README, were
created by the same script and are necessary for the other files
to function correctly.
""")

README.close()

print("Preprocessing the data...")

# Pipeline: 2,000,000 8x8 patches -> contrast normalization -> ZCA.
pipeline = preprocessing.Pipeline()
for step in (
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000),
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True),
        preprocessing.ZCA()):
    pipeline.items.append(step)

# can_fit=True: this is the data the pipeline is fitted on.
data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

# Store the design matrix in a separate .npy file next to the pickle.
data.use_design_loc(patch_dir + '/data.npy')

# Save the dataset and the fitted pipeline side by side.
serial.save(patch_dir + '/data.pkl', data)
serial.save(patch_dir + '/preprocessor.pkl', pipeline)
예제 #19
0
def get_data(tot=True, flatgrey=False):
    """Load the preprocessed GalaxyZoo datasets, or build them on first run.

    Only two datasets are ever produced: ``tottrain`` (the whole training
    set) and ``test``. Separate train/valid splits are not generated.

    Parameters
    ----------
    tot : bool
        Must be True to load cached data. There is no ``train`` split to
        return, so ``tot=False`` raises ``KeyError`` when the cache exists.
    flatgrey : bool
        Forwarded to ``GZData`` when the datasets are built.

    Returns
    -------
    tuple
        ``(tottrain, test)`` when the cached test pickle exists. Otherwise
        the datasets are preprocessed (GCN + ZCA, fitted on tottrain only)
        and saved, and ``(None, None)`` is returned; the caller is expected
        to run again to load them.

    Raises
    ------
    KeyError
        If the cache exists and ``tot`` is False.
    """
    tottrain_path = DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl'
    test_path = DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl'

    if os.path.exists(test_path):
        if not tot:
            # Previously this surfaced as a bare KeyError('train') after the
            # test set had already been loaded; fail early and say why.
            raise KeyError("'train' split is not generated by get_data; "
                           "call with tot=True")

        print('loading preprocessed data')
        tottrain = serial.load(tottrain_path)
        test = serial.load(test_path)
        return tottrain, test

    print('preprocessing data...')
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalization(use_std=True))
    pipeline.items.append(preprocessing.ZCA())

    print('tottraindata')
    data = GalaxyZoo.gzdeepdata.GZData(which_set='training', flatgrey=flatgrey)
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    # this path can be used for visualizing weights after training is done;
    # it must name the pickle file, not the dataset object
    data.yaml_src = '!pkl: "%s"' % tottrain_path
    # save
    data.use_design_loc(DATA_DIR + 'tottrain_design' + str(SUBMODEL) + '_64x.npy')
    serial.save(tottrain_path, data)

    print('testdata')
    data = GalaxyZoo.gzdeepdata.GZData(which_set='test', flatgrey=flatgrey)
    # can_fit=False: reuse the transformation fitted on the training data
    data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
    data.yaml_src = '!pkl: "%s"' % test_path
    # save
    data.use_design_loc(DATA_DIR + 'test_design' + str(SUBMODEL) + '_64x.npy')
    serial.save(test_path, data)

    print('Finished, now re-run for running model on GPU')
    return None, None