def create_datasets(cls, datasets=None, overwrite=False, img_dir=DATA_DIR,
                    output_dir=DATA_DIR):
    """Build the requested datasets and write each one to ``output_dir``.

    Parameters
    ----------
    datasets : iterable of str, optional
        Names passed as ``which_set`` to ``cls``. Falls back to
        ``cls.ALL_DATASETS`` when empty or None.
    overwrite : bool
        When true, every dataset is rebuilt even if its output files exist.
    img_dir : str
        Forwarded to ``cls`` as ``img_dir``.
    output_dir : str
        Directory that receives ``<name>.pkl`` and ``<name>.npy``.
    """
    if not datasets:
        datasets = cls.ALL_DATASETS
    serial.mkdir(output_dir)

    for dataset_name in list(datasets):
        output_files = {
            ext: os.path.join(output_dir, '%s.%s' % (dataset_name, ext))
            for ext in ('pkl', 'npy')
        }

        # A dataset is skipped only when both of its files are already on
        # disk and the caller did not ask for a rebuild.
        all_present = all(os.path.isfile(path)
                          for path in output_files.values())
        if all_present and not overwrite:
            continue

        print("Loading the %s data" % dataset_name)
        dataset = cls(which_set=dataset_name, img_dir=img_dir)

        print("Saving the %s data" % dataset_name)
        dataset.use_design_loc(output_files['npy'])
        serial.save(output_files['pkl'], dataset)
def main():
    """Extract whitened 8x8 patches from the downsampled STL-10 data.

    Loads the 32x32 ``unlabeled.pkl`` and ``train.pkl`` datasets from
    ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32``, concatenates their design
    matrices, and runs a patch-extraction / global-contrast-normalization /
    ZCA pipeline over the result. The processed dataset, the pipeline and a
    README are written to ``${PYLEARN2_DATA_PATH}/stl10/stl10_patches_8x8``.
    """
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL10-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    # The train set is no longer needed once it has been merged in.
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)

    # The context manager closes the README even if the write fails.
    # The patch size stated below matches the ExtractPatches shape (8x8).
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        data.pkl contains a pylearn2 Dataset object defining an unlabeled
        dataset of 2 million 8x8 approximately whitened, contrast-normalized
        patches drawn uniformly at random from a downsampled (to 32x32)
        version of the STL-10 train and unlabeled datasets.

        preprocessor.pkl contains a pylearn2 Pipeline object that was used
        to extract the patches and approximately whiten / contrast normalize
        them. This object is necessary when extracting features for
        supervised learning or test set classification, because the
        extracted features must be computed using inputs that have been
        whitened with the ZCA matrix learned and stored by this Pipeline.

        They were created with the pylearn2 script make_stl10_patches.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(8, 8),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')
    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def main():
    """Create the GCN + ZCA-whitened CIFAR-100 train and test datasets.

    Loads CIFAR-100 with ``gcn=55.``, applies a ZCA preprocessor to the train
    set with ``can_fit=True``, then applies the same preprocessor object to
    the test set with ``can_fit=False``. Both datasets, the preprocessor and
    a README are written to
    ``${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened``.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)

    # The context manager closes the README even if the write fails.
    with open(output_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        train.pkl, and test.pkl each contain a pylearn2 Dataset object
        defining a labeled dataset of a 32x32 contrast normalized,
        approximately whitened version of the CIFAR-100 dataset.
        train.pkl contains labeled train examples. test.pkl contains
        labeled test examples.

        preprocessor.pkl contains a pylearn2 ZCA object that was used to
        approximately whiten the images. You may want to use this object
        later to preprocess other images.

        They were created with the pylearn2 script
        make_cifar100_gcn_whitened.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    # Adjacent literals are joined at compile time, so the message carries
    # no stray continuation whitespace.
    print("Learning the preprocessor "
          "and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
def emit_eta_h(method, directory, n, eta_h):
    """Create the ``eta_h_<value>`` subdirectory and emit into it.

    ``method`` selects the emitter: ``'cg'`` calls ``emit_cg``; anything else
    is asserted to be ``'heuristic'`` and calls ``emit_heuristic``. Both
    emitters receive the new subdirectory, ``n`` and ``eta_h``.
    """
    subdir_name = 'eta_h_' + str(eta_h)
    directory = directory + '/' + subdir_name
    serial.mkdir(directory)

    if method == 'cg':
        emit_cg(directory, n, eta_h)
        return

    assert method == 'heuristic'
    emit_heuristic(directory, n, eta_h)
def emit_eta_h(method, directory, n, eta_h):
    """Emit the files for one ``eta_h`` value into its own subdirectory.

    The subdirectory ``<directory>/eta_h_<eta_h>`` is created first. The
    ``'cg'`` method is handled by ``emit_cg``; the only other accepted
    method, ``'heuristic'`` (checked with an assert), is handled by
    ``emit_heuristic``.
    """
    directory = directory + '/eta_h_' + str(eta_h)
    serial.mkdir(directory)

    use_cg = (method == 'cg')
    if not use_cg:
        assert method == 'heuristic'
        emit_heuristic(directory, n, eta_h)
    else:
        emit_cg(directory, n, eta_h)
def main():
    """Build and save GCN + ZCA-whitened copies of CIFAR-100.

    The train set (loaded with ``gcn=55.``) is passed through a ZCA
    preprocessor with ``can_fit=True``. The test set is then passed through
    the same preprocessor object with ``can_fit=False``. The output goes to
    ``${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened``: ``train`` and
    ``test`` as .pkl/.npy pairs, ``preprocessor.pkl`` and a README.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)

    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl, and test.pkl each contain a pylearn2 Dataset object
    defining a labeled dataset of a 32x32 contrast normalized,
    approximately whitened version of the CIFAR-100 dataset.
    train.pkl contains labeled train examples. test.pkl contains
    labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used to
    approximately whiten the images. You may want to use this object
    later to preprocess other images.

    They were created with the pylearn2 script
    make_cifar100_gcn_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)
    # `with` guarantees the handle is closed even when write() raises.
    with open(output_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    # Implicit literal concatenation keeps the message free of the
    # whitespace a backslash continuation inside the string would embed.
    print("Learning the preprocessor "
          "and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
def main():
    """Extract whitened 6x6 patches from the CIFAR-100 train set.

    Runs a patch-extraction / global-contrast-normalization / ZCA pipeline
    over the CIFAR-100 train set. The processed dataset, the pipeline and a
    README are written to
    ``${PYLEARN2_DATA_PATH}/cifar100/cifar100_patches``.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)

    # The context manager closes the README even if the write fails.
    with open(patch_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        data.pkl contains a pylearn2 Dataset object defining an unlabeled
        dataset of 2 million 6x6 approximately whitened, contrast-normalized
        patches drawn uniformly at random from the CIFAR-100 train set.

        preprocessor.pkl contains a pylearn2 Pipeline object that was used
        to extract the patches and approximately whiten / contrast normalize
        them. This object is necessary when extracting features for
        supervised learning or test set classification, because the
        extracted features must be computed using inputs that have been
        whitened with the ZCA matrix learned and stored by this Pipeline.

        They were created with the pylearn2 script make_cifar100_patches.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')
    serial.save(patch_dir + '/data.pkl', data)
    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def deal_npy_file(whitenFile_label, whitenFile_feature, txtfile, mode):
    """Split a feature array into one .npy file per example plus an index.

    Parameters
    ----------
    whitenFile_label : str
        Path of a .npy file holding the labels; ``y[i][0]`` is written as the
        label of example ``i``.
    whitenFile_feature : str
        Path of a .npy file holding the features. Each row is reshaped to
        (3, 32, 32) and transposed to (32, 32, 3).
    txtfile : str
        Path of the index file to write, one ``<label>,<path>.npy`` line per
        example.
    mode : str
        Used as the output subdirectory name under ``./cifar10_npy/`` and as
        the file-name prefix.
    """
    y = np.load(whitenFile_label)
    print(y.shape)

    x = np.load(whitenFile_feature)
    x = x.reshape((x.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)
    print(x.shape)

    output_dir = "./cifar10_npy/" + mode
    serial.mkdir(output_dir)

    file_names = []
    for i in range(x.shape[0]):
        # np.save appends the ".npy" suffix itself; the index line spells it
        # out so that it names the file actually written.
        stem = output_dir + "/" + mode + str(i)
        np.save(stem, x[i])
        file_names.append(str(y[i][0]) + "," + stem + ".npy" + "\n")

    # Close the index file deterministically instead of leaking the handle.
    with open(txtfile, "w") as index_file:
        index_file.writelines(file_names)
    print(len(file_names))
def deal_npy_file(whitenFile_label, whitenFile_feature, txtfile, mode):
    """Write every example of a feature array as a PNG plus an index file.

    Parameters
    ----------
    whitenFile_label : str
        Path of a .npy file holding the labels; ``y[i][0]`` is written as the
        label of example ``i``.
    whitenFile_feature : str
        Path of a .npy file holding the features. Each row is reshaped to
        (3, 32, 32) and transposed to (32, 32, 3) before being saved.
    txtfile : str
        Path of the index file to write, one ``<label>,<png path>`` line per
        example.
    mode : str
        Used to build the output directory
        ``./cifar10_images_from_npy/<mode>_misc`` and the file-name prefix.
    """
    y = np.load(whitenFile_label)
    print(y.shape)

    x = np.load(whitenFile_feature)
    x = x.reshape((x.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1)
    print(x.shape)

    output_dir = "./cifar10_images_from_npy/" + mode + "_misc"
    serial.mkdir(output_dir)

    file_names = []
    for i in range(x.shape[0]):
        name = output_dir + "/" + mode + str(i) + "_misc.png"
        misc.imsave(name, x[i])
        file_names.append(str(y[i][0]) + "," + name + "\n")

    # Close the index file deterministically instead of leaking the handle.
    with open(txtfile, "w") as index_file:
        index_file.writelines(file_names)
    print(len(file_names))
def create_datasets(cls, datasets=None, overwrite=False, img_dir=DATA_DIR,
                    output_dir=DATA_DIR):
    """Create each requested dataset and save it as a .pkl/.npy pair.

    ``datasets`` defaults to ``cls.ALL_DATASETS``. A dataset is rebuilt when
    ``overwrite`` is true or when either ``<name>.pkl`` or ``<name>.npy`` is
    missing from ``output_dir``; otherwise it is left untouched. ``img_dir``
    is forwarded to the ``cls`` constructor.
    """
    datasets = datasets or cls.ALL_DATASETS
    serial.mkdir(output_dir)

    extensions = ('pkl', 'npy')
    for dataset_name in list(datasets):
        targets = {}
        for ext in extensions:
            targets[ext] = os.path.join(output_dir,
                                        '%s.%s' % (dataset_name, ext))

        missing = [path for path in targets.values()
                   if not os.path.isfile(path)]
        if not (overwrite or missing):
            # Both files already exist and no rebuild was requested.
            continue

        print("Loading the %s data" % dataset_name)
        dataset = cls(which_set=dataset_name, img_dir=img_dir)

        print("Saving the %s data" % dataset_name)
        dataset.use_design_loc(targets['npy'])
        serial.save(targets['pkl'], dataset)
params = yaml_parse.load_path('params.yaml') validate = open('validate.yaml', 'r') validate_template = validate.read() validate.close() for expnum, line in enumerate(lines): elems = line.split(' ') assert elems[-1] == '\n' obj = elems[0] if obj == 'P': expdir = '/RQexec/goodfell/experiment_6/%d' % expnum if os.path.exists(expdir): continue try: mkdir(expdir) config = {} for param, value in safe_zip(params, elems[2:-1]): if param['type'] == 'float': value = float(value) elif param['type'] == 'int': value = int(value) else: raise NotImplementedError() if 'postprocess' in param: value = param['postprocess'](value) if 'joint_postprocess' in param: try: value = param['joint_postprocess'](value, config) except Exception, e:
This script also translates the data to lie in [-127.5, 127.5] instead of [0,255]. This makes it play nicer with some of pylearn's visualization tools. """ from pylearn2.datasets.stl10 import STL10 from pylearn2.datasets.preprocessing import Downsample from pylearn2.utils import string_utils as string from pylearn2.utils import serial import numpy as np print 'Preparing output directory...' data_dir = string.preprocess('${PYLEARN2_DATA_PATH}') downsampled_dir = data_dir + '/stl10_32x32' serial.mkdir( downsampled_dir ) README = open(downsampled_dir + '/README','w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. They contain pylearn2 Dataset objects defining the STL-10 dataset, but downsampled to size 32x32 and translated to lie in [-127.5, 127.5 ]. They were created with the pylearn2 script make_downsampled_stl10.py All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """)
import os from pylearn2.utils import serial assert len(sys.argv) in [2, 3, 4] train_file = sys.argv[1] if train_file.endswith('.npy'): pieces = train_file.split('.npy') elif train_file.endswith('mat'): pieces = train_file.split('.mat') else: assert False assert len(pieces) == 2 results_dir = pieces[0] serial.mkdir(results_dir) if len(sys.argv) > 3: memreq = sys.argv[3] else: memreq = '12G' command = 'jobdispatch --duree=48:00:00 --whitespace --mem=%(memreq)s /RQusagers/goodfell/cifar100_fold_point_worker ' % locals( ) command += ' "{{' if len(sys.argv) > 2: C_list = sys.argv[2] C_list = [float(C) for C in C_list.split(',')]
if str(val.dtype) == 'bool': val = val.astype('int') params[key] = val assert val.shape == (num_jobs, ) #print key,':',(val.min(),val.mean(),val.max()) ref = { "layer_2_target": 0.0890535860395, "layer_2_irange": 0.0301747773266, "layer_2_init_bias": -0.741101442887, "layer_1_init_bias": -0.397164399345, "balance": 0 } yaml.dump(ref) mkdir(out_dir) for i in xrange(num_jobs): cur_dir = out_dir + '/' + str(i) mkdir(cur_dir) path = cur_dir + '/stage_00_inpaint_params.yaml' obj = dict([(key, params[key][i]) for key in params]) assert all([isinstance(key, str) for key in obj]) assert all([isinstance(val, (int, float)) for val in obj.values()]) # numpy has actually given us subclassed ints/floats that yaml doesn't know how to serialize for key in obj: if isinstance(obj[key], float): obj[key] = float(obj[key]) elif isinstance(obj[key], int):
import sys from pylearn2.utils import serial ignore, model_path, script_dir = sys.argv serial.mkdir(script_dir) chunk_size = 1000 m = 10000 assert m % chunk_size == 0 num_chunks = m / chunk_size assert num_chunks == 10 for i in xrange(num_chunks): start = i * chunk_size stop = (i+1)*chunk_size name = 'chunk_%d.yaml' % i f = open(script_dir + '/' + name, 'w') f.write("""!obj:galatea.pddbm.extract_features.FeatureExtractor { batch_size : 1, model_path : %(model_path)s, pooling_region_counts : [ 3 ], save_paths : [ %(script_dir)s/chunk_%(i)d.npy ], feature_type : "exp_h,exp_g", dataset_family : galatea.pddbm.extract_features.cifar100, which_set : "test", restrict : [ %(start)d, %(stop)d ] }""" % locals() )
if arg == 'public_test': base = preprocess( '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_images') outdir = base[:-6] + 'lcn' expected_num_images = 500 elif arg == 'private_test': base = preprocess( '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_images') outdir = base[:-6] + 'lcn' expected_num_images = 500 else: usage() print 'Unrecognized argument value:', arg print 'Recognized values are: public_test, private_test' serial.mkdir(outdir) paths = os.listdir(base) if len(paths) != expected_num_images: raise AssertionError("Something is wrong with your " + base \ + "directory. It should contain " + str(expected_num_images) + \ " image files, but contains " + str(len(paths))) kernel_shape = 7 from theano import tensor as T from pylearn2.utils import sharedX from pylearn2.datasets.preprocessing import gaussian_filter from theano.tensor.nnet import conv2d X = T.TensorType(dtype='float32', broadcastable=(True, False, False, True))()
learning_rate = 10. ** rng.uniform(-2., -.5) if rng.randint(2): msat = 2 else: msat = rng.randint(2, 1000) final_momentum = rng.uniform(.5, .9) lr_sat = rng.randint(200, 1000) decay = 10. ** rng.uniform(-3, -1) task_0_yaml_str = task_0_template % locals() serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id)) train_file_full_stem = '{}exp/'.format(EXP_PATH)+str(job_id)+'/' f = open(train_file_full_stem + 'task_0.yaml', 'w') f.write(task_0_yaml_str) f.close() task_1_yaml_str = task_1_template % locals() serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id)) f = open(train_file_full_stem + 'task_1.yaml', 'w') f.write(task_1_yaml_str) f.close()
from pylearn2.datasets.tfd import TFD from pylearn2.utils import string_utils from hossrbm import preproc as my_preproc data_dir = string_utils.preprocess('/data/lisatmp2/desjagui/data') pipeline = preprocessing.Pipeline() pipeline.items.append(preprocessing.GlobalContrastNormalization(subtract_mean=True)) pipeline.items.append(my_preproc.LeCunLCN((1,48,48))) pipeline.items.append(preprocessing.RemoveMean(axis=0)) pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(14,14), num_patches=5*1000*1000)) #### Build full-sized image dataset. #### print "Preparing output directory for unlabeled patches..." outdir = data_dir + '/tfd_lcn_v1' serial.mkdir(outdir) README = open('README','w') README.write(""" File generated from hossrbm/scripts/tfd/make_tfd_lcn.py. """) README.close() print 'Loading TFD unlabeled dataset...' print "Preprocessing the data..." data = TFD('unlabeled') data.apply_preprocessor(preprocessor = pipeline, can_fit = True) data.use_design_loc(outdir + '/unlabeled_patches.npy') serial.save(outdir + '/unlabeled_patches.pkl',data) #### For supervised dataset, we work on the full-image dataset #### pipeline.items.pop()
h0_bias = sigmoid_bias() h1_bias = sigmoid_bias() learning_rate = 10.**rng.uniform(-2., -.5) if rng.randint(2): msat = 2 else: msat = rng.randint(2, 1000) final_momentum = rng.uniform(.5, .9) lr_sat = rng.randint(200, 1000) decay = 10.**rng.uniform(-3, -1) task_0_yaml_str = task_0_template % locals() serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id)) train_file_full_stem = '{}exp/'.format(EXP_PATH) + str(job_id) + '/' f = open(train_file_full_stem + 'task_0.yaml', 'w') f.write(task_0_yaml_str) f.close() task_1_yaml_str = task_1_template % locals() serial.mkdir('{}exp/'.format(EXP_PATH) + str(job_id)) f = open(train_file_full_stem + 'task_1.yaml', 'w') f.write(task_1_yaml_str) f.close()
import sys from pylearn2.utils import serial ignore, model_path, script_dir = sys.argv serial.mkdir(script_dir) chunk_size = 1000 m = 50000 assert m % chunk_size == 0 num_chunks = m / chunk_size for i in xrange(num_chunks): start = i * chunk_size stop = (i + 1) * chunk_size name = 'chunk_%d.yaml' % i f = open(script_dir + '/' + name, 'w') f.write("""!obj:galatea.pddbm.extract_features.FeatureExtractor { batch_size : 1, model_path : %(model_path)s, pooling_region_counts : [ 3 ], save_paths : [ %(script_dir)s/chunk_%(i)d.npy ], feature_type : "exp_h,exp_g", dataset_family : galatea.pddbm.extract_features.cifar100, which_set : "train", restrict : [ %(start)d, %(stop)d ] }""" % locals()) f.close()
thumbnail_path = image_path.replace(input_path,output_path) thumbnail_path = thumbnail_path.replace('.JPEG','.npy') t1 = time.time() e = os.path.exists(thumbnail_path) t2 = time.time() print t2-t1 if e: continue thumbnail_subdir = '/'.join(thumbnail_path.split('/')[:-1]) if thumbnail_subdir not in created_subdirs: serial.mkdir(thumbnail_subdir) created_subdirs = created_subdirs.union([thumbnail_subdir]) try: t1 = time.time() img = image.load(image_path) t2 = time.time() except Exception, e: print "Encountered a problem: "+str(e) img = None if img is not None: assert len(img.shape) == 3 thumbnail = image.make_letterboxed_thumbnail(img, image_shape) t3 = time.time()
def main():
    """Create ZCA-whitened versions of the downsampled STL-10 datasets.

    Loads ``unlabeled.pkl`` and ``train.pkl`` from
    ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32``, concatenates their design
    matrices and applies a ZCA preprocessor to the union with
    ``can_fit=True``. The whitened union is saved as ``unsupervised``, then
    split back into ``unlabeled`` (first 100,000 rows) and ``train`` (the
    rest). The test set is whitened with the same preprocessor object
    (``can_fit=False``). Everything is written to
    ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32_whitened``.
    """
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))

    print("Preparing output directory...")
    output_dir = data_dir + '/stl10_32x32_whitened'
    serial.mkdir(output_dir)

    # The context manager closes the README even if the write fails.
    with open(output_dir + '/README', 'w') as readme:
        readme.write(textwrap.dedent("""
        The .pkl files in this directory may be opened in python using
        cPickle, pickle, or pylearn2.serial.load.

        unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each
        contain a pylearn2 Dataset object defining an unlabeled dataset of
        a 32x32 approximately whitened version of the STL-10 dataset.
        unlabeled.pkl contains unlabeled train examples. train.pkl contains
        labeled train examples. unsupervised.pkl contains the union of
        these (without any labels). test.pkl contains the labeled test
        examples.

        preprocessor.pkl contains a pylearn2 ZCA object that was used to
        approximately whiten the images. You may want to use this object
        later to preprocess other images.

        They were created with the pylearn2 script make_stl10_whitened.py.

        All other files in this directory, including this README, were
        created by the same script and are necessary for the other files
        to function correctly.
        """))

    # Adjacent literals are joined at compile time, so the message carries
    # no stray continuation whitespace.
    print("Learning the preprocessor "
          "and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    data.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    data.use_design_loc(output_dir + '/unsupervised.npy')
    serial.save(output_dir + '/unsupervised.pkl', data)

    # Undo the concatenation: the unlabeled rows came first.
    X = data.X
    unlabeled = X[0:100 * 1000, :]
    labeled = X[100 * 1000:, :]
    del X

    print("Saving the unlabeled data")
    data.X = unlabeled
    data.use_design_loc(output_dir + '/unlabeled.npy')
    serial.save(output_dir + '/unlabeled.pkl', data)
    del data
    del unlabeled

    print("Saving the labeled train data")
    supplement.X = labeled
    supplement.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', supplement)
    del supplement
    del labeled

    print("Loading the test data")
    test = serial.load(downsampled_dir + '/test.pkl')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
for key in sorted(params.keys()): val = params[key] if isinstance(val, list): val = np.asarray(val) if str(val.dtype) == 'bool': val = val.astype('int') params[key] = val assert val.shape == (num_jobs, ) #print key,':',(val.min(),val.mean(),val.max()) ref = {"layer_2_target":0.0890535860395, "layer_2_irange":0.0301747773266, "layer_2_init_bias":-0.741101442887, "layer_1_init_bias":-0.397164399345, "balance":0} yaml.dump(ref) mkdir(out_dir) for i in xrange(num_jobs): cur_dir = out_dir +'/'+str(i) mkdir(cur_dir) path = cur_dir + '/stage_00_inpaint_params.yaml' obj = dict([(key, params[key][i]) for key in params]) assert all([isinstance(key, str) for key in obj]) assert all([isinstance(val, (int, float)) for val in obj.values()]) # numpy has actually given us subclassed ints/floats that yaml doesn't know how to serialize for key in obj: if isinstance(obj[key], float): obj[key] = float(obj[key]) elif isinstance(obj[key], int):
thumbnail_path = image_path.replace(input_path, output_path) thumbnail_path = thumbnail_path.replace('.JPEG', '.npy') t1 = time.time() e = os.path.exists(thumbnail_path) t2 = time.time() print t2 - t1 if e: continue thumbnail_subdir = '/'.join(thumbnail_path.split('/')[:-1]) if thumbnail_subdir not in created_subdirs: serial.mkdir(thumbnail_subdir) created_subdirs = created_subdirs.union([thumbnail_subdir]) try: t1 = time.time() img = image.load(image_path) t2 = time.time() except Exception, e: print "Encountered a problem: " + str(e) img = None if img is not None: assert len(img.shape) == 3 thumbnail = image.make_letterboxed_thumbnail(img, image_shape) t3 = time.time()
learning_rate = 10. ** rng.uniform(-2., -.5) if rng.randint(2): msat = 2 else: msat = rng.randint(2, 1000) final_momentum = rng.uniform(.5, .9) lr_sat = rng.randint(200, 1000) decay = 10. ** rng.uniform(-3, -1) task_0_yaml_str = task_0_template % locals() serial.mkdir('exp/' + str(job_id)) train_file_full_stem = 'exp/'+str(job_id)+'/' f = open(train_file_full_stem + 'task_0.yaml', 'w') f.write(task_0_yaml_str) f.close() task_1_yaml_str = task_1_template % locals() serial.mkdir('exp/' + str(job_id)) f = open(train_file_full_stem + 'task_1.yaml', 'w') f.write(task_1_yaml_str) f.close()
from pylearn2.utils import serial from pylearn2.utils import string_utils import numpy import argparse from hossrbm.scripts.conv_pipeline import cssrbm_feature_extractor as featext print "Preparing output directory..." data_dir = string_utils.preprocess('/data/lisatmp2/desjagui/data') indir = data_dir + '/tfd_cn' outdir = data_dir + '/tfd_cn_layer2' serial.mkdir(outdir) parser = argparse.ArgumentParser() parser.add_argument('--model', help='Path of model .pkl file.') args = parser.parse_args() """ print 'Processing unlabeled set...' in_dset_fname = '%s/%s.pkl' % (indir, 'unlabeled') out_dset_fname = '%s/%s.pkl' % (outdir, 'unlabeled') featext.run(args.model, in_dset_fname, batch_size = 128, image_width = 48, patch_width = 14, pool_width = 12, output_width = 9216, output_file = out_dset_fname) """
import os from pylearn2.utils import serial assert len(sys.argv) in [2,3,4] train_file = sys.argv[1] if train_file.endswith('.npy'): pieces = train_file.split('.npy') elif train_file.endswith('mat'): pieces = train_file.split('.mat') else: assert False assert len(pieces) == 2 results_dir = pieces[0] serial.mkdir(results_dir) if len(sys.argv) > 3: assert False memreq = sys.argv[3] else: memreq = '15000M' command = 'jobdispatch --duree=48:00:00 --whitespace --mem=%(memreq)s /data/lisatmp2/goodfeli/cifar10_fold_point_worker ' % locals() command += ' "{{' if len(sys.argv) > 2: C_list = sys.argv[2] C_list = [ float(C) for C in C_list.split(',') ]
""" from __future__ import print_function from theano.compat.six.moves import xrange from pylearn2.datasets.stl10 import STL10 from pylearn2.datasets.preprocessing import Downsample from pylearn2.utils import string_utils as string from pylearn2.utils import serial import numpy as np print('Preparing output directory...') data_dir = string.preprocess('${PYLEARN2_DATA_PATH}') downsampled_dir = data_dir + '/stl10_32x32' serial.mkdir(downsampled_dir) README = open(downsampled_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. They contain pylearn2 Dataset objects defining the STL-10 dataset, but downsampled to size 32x32 and translated to lie in [-127.5, 127.5 ]. They were created with the pylearn2 script make_downsampled_stl10.py All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """)
def main():
    """Whiten the 32x32 STL-10 datasets with ZCA and save the results.

    The unlabeled and train design matrices are concatenated and a ZCA
    preprocessor is applied to the union with ``can_fit=True``. The union is
    saved as ``unsupervised``; its first 100,000 rows are saved as
    ``unlabeled`` and the remaining rows as ``train``. The test set is passed
    through the same preprocessor object with ``can_fit=False``. Outputs,
    the preprocessor and a README go to
    ``${PYLEARN2_DATA_PATH}/stl10/stl10_32x32_whitened``.
    """
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))

    print("Preparing output directory...")
    output_dir = data_dir + '/stl10_32x32_whitened'
    serial.mkdir(output_dir)

    readme_text = textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each
    contain a pylearn2 Dataset object defining an unlabeled dataset of
    a 32x32 approximately whitened version of the STL-10 dataset.
    unlabeled.pkl contains unlabeled train examples. train.pkl contains
    labeled train examples. unsupervised.pkl contains the union of
    these (without any labels). test.pkl contains the labeled test
    examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used to
    approximately whiten the images. You may want to use this object
    later to preprocess other images.

    They were created with the pylearn2 script make_stl10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """)
    # `with` guarantees the handle is closed even when write() raises.
    with open(output_dir + '/README', 'w') as readme:
        readme.write(readme_text)

    # Implicit literal concatenation keeps the message free of the
    # whitespace a backslash continuation inside the string would embed.
    print("Learning the preprocessor "
          "and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    data.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    data.use_design_loc(output_dir + '/unsupervised.npy')
    serial.save(output_dir + '/unsupervised.pkl', data)

    # Split the whitened union back apart; unlabeled rows were placed first.
    X = data.X
    unlabeled = X[0:100 * 1000, :]
    labeled = X[100 * 1000:, :]
    del X

    print("Saving the unlabeled data")
    data.X = unlabeled
    data.use_design_loc(output_dir + '/unlabeled.npy')
    serial.save(output_dir + '/unlabeled.pkl', data)
    del data
    del unlabeled

    print("Saving the labeled train data")
    supplement.X = labeled
    supplement.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', supplement)
    del supplement
    del labeled

    print("Loading the test data")
    test = serial.load(downsampled_dir + '/test.pkl')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10') print 'Loading STL10-10 unlabeled and train datasets...' downsampled_dir = data_dir + '/stl10_32x32' data = serial.load(downsampled_dir + '/unlabeled.pkl') supplement = serial.load(downsampled_dir + '/train.pkl') print 'Concatenating datasets...' data.set_design_matrix(np.concatenate((data.X,supplement.X),axis=0)) del supplement print "Preparing output directory..." patch_dir = data_dir + '/stl10_patches_8x8' serial.mkdir( patch_dir ) README = open(patch_dir + '/README','w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. data.pkl contains a pylearn2 Dataset object defining an unlabeled dataset of 2 million 6x6 approximately whitened, contrast-normalized patches drawn uniformly at random from a downsampled (to 32x32) version of the STL-10 train and unlabeled datasets. preprocessor.pkl contains a pylearn2 Pipeline object that was used to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the
for output, count in zip(outputs, pooling_region_counts): output[i:i + batch_size, ...] = average_pool(count) t6 = time.time() print(t6 - t1, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5) return outputs[0] if __name__ == '__main__': assert len(sys.argv) == 3 ipath = sys.argv[1] opath = sys.argv[2] serial.mkdir(opath) model = serial.load('/data/lisatmp/goodfeli/darpa_s3c.pkl') preprocessor = serial.load( '/data/lisatmp/goodfeli/darpa_imagenet_patch_6x6_train_preprocessor.pkl' ) patchifier = ExtractGridPatches(patch_shape=(size, size), patch_stride=(1, 1)) preprocessor.items.insert(0, patchifier) extractor = FeatureExtractor(model=model, preprocessor=preprocessor) contents = os.listdir(ipath) for i, fname in enumerate(contents): print str(i + 1) + '/' + str(len(contents))
h0_bias = sigmoid_bias() h1_bias = sigmoid_bias() learning_rate = 10.**rng.uniform(-2., -.5) if rng.randint(2): msat = 2 else: msat = rng.randint(2, 1000) final_momentum = rng.uniform(.5, .9) lr_sat = rng.randint(200, 1000) decay = 10.**rng.uniform(-3, -1) task_0_yaml_str = task_0_template % locals() serial.mkdir('exp/' + str(job_id)) train_file_full_stem = 'exp/' + str(job_id) + '/' f = open(train_file_full_stem + 'task_0.yaml', 'w') f.write(task_0_yaml_str) f.close() task_1_yaml_str = task_1_template % locals() serial.mkdir('exp/' + str(job_id)) f = open(train_file_full_stem + 'task_1.yaml', 'w') f.write(task_1_yaml_str) f.close()
dataset_str = { 'stlfull': '${STL10_WHITENED_UNSUP}', 'stlpatch': '${STL10_PATCHES_6x6}', 'cifarfull': '${CIFAR10_WHITENED_TRAIN}', 'cifarpatch': '${CIFAR10_PATCHES_6x6}' }[dataset + kind] for size in ['small', 'med', 'big']: N = {'small': 625, 'med': 1600, 'big': 4000}[size] directory = 'models/%s/%s/%s' % (dataset, kind, size) path = '%s/random_patches.yaml' % (directory) serial.mkdir(directory) f = open(path, 'w') f.write(""" !obj:pylearn2.scripts.train.Train { "dataset": !pkl: &src "%s", "model": !obj:galatea.s3c.s3c.S3C { "nvis" : 108, "nhid" : %d, "init_bias_hid" : -4., "max_bias_hid" : 0., "min_bias_hid" : -7., "irange" : .02, "constrain_W_norm" : 1, "init_B" : 3.,
""" from __future__ import print_function from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.datasets.cifar100 import CIFAR100 from pylearn2.utils import string data_dir = string.preprocess('${PYLEARN2_DATA_PATH}') print('Loading CIFAR-100 train dataset...') data = CIFAR100(which_set='train') print("Preparing output directory...") patch_dir = data_dir + '/cifar100/cifar100_patches_8x8' serial.mkdir(patch_dir) README = open(patch_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. data.pkl contains a pylearn2 Dataset object defining an unlabeled dataset of 2 million 8x8 approximately whitened, contrast-normalized patches drawn uniformly at random from the CIFAR-100 train set. preprocessor.pkl contains a pylearn2 Pipeline object that was used to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the extracted features must be computed using inputs that have been
for kind in [ 'full', 'patch' ]: dataset_str = { 'stlfull' : '${STL10_WHITENED_UNSUP}', 'stlpatch' : '${STL10_PATCHES_6x6}', 'cifarfull' : '${CIFAR10_WHITENED_TRAIN}', 'cifarpatch' : '${CIFAR10_PATCHES_6x6}' }[dataset+kind] for size in [ 'small', 'med', 'big' ]: N = { 'small' : 625, 'med' : 1600, 'big' : 4000 }[size] directory = 'models/%s/%s/%s' % (dataset, kind, size) path = '%s/random_patches.yaml' % (directory) serial.mkdir(directory) f = open(path,'w') f.write(""" !obj:pylearn2.scripts.train.Train { "dataset": !pkl: &src "%s", "model": !obj:galatea.s3c.s3c.S3C { "nvis" : 108, "nhid" : %d, "init_bias_hid" : -4., "max_bias_hid" : 0., "min_bias_hid" : -7., "irange" : .02, "constrain_W_norm" : 1, "init_B" : 3.,
""" from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils from pylearn2.datasets.cifar100 import CIFAR100 data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print 'Loading CIFAR-100 train dataset...' train = CIFAR100(which_set = 'train', gcn = 55.) print "Preparing output directory..." output_dir = data_dir + '/pylearn2_gcn_whitened' serial.mkdir( output_dir ) README = open(output_dir + '/README','w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples. preprocessor.pkl contains a pylearn2 ZCA object that was used to approximately whiten the images. You may want to use this object later to preprocess other images.
#average pooling for output, count in zip(outputs, pooling_region_counts): output[i:i+batch_size,...] = average_pool(count) t6 = time.time() print (t6-t1, t2-t1, t3-t2, t4-t3, t5-t4, t6-t5) return outputs[0] if __name__ == '__main__': assert len(sys.argv) == 3 ipath = sys.argv[1] opath = sys.argv[2] serial.mkdir(opath) model = serial.load('/data/lisatmp/goodfeli/darpa_s3c.pkl') preprocessor = serial.load('/data/lisatmp/goodfeli/darpa_imagenet_patch_6x6_train_preprocessor.pkl') patchifier = ExtractGridPatches( patch_shape = (size,size), patch_stride = (1,1) ) preprocessor.items.insert(0,patchifier) extractor = FeatureExtractor( model = model, preprocessor = preprocessor) contents = os.listdir(ipath) for i, fname in enumerate(contents): print str(i+1)+'/'+str(len(contents)) X = np.load(ipath+'/'+fname) X = extractor(X) np.save(opath+'/'+fname,X)