def test_preprocess(): """ Tests that `preprocess` fills in environment variables using various interfaces and raises a ValueError if a needed environment variable definition is missing. """ try: keys = ["PYLEARN2_" + str(uuid.uuid1())[:8] for _ in xrange(3)] strs = ["${%s}" % k for k in keys] os.environ[keys[0]] = keys[1] # Test with os.environ only. assert preprocess(strs[0]) == keys[1] # Test with provided dict only. assert preprocess(strs[1], environ={keys[1]: keys[2]}) == keys[2] # Provided overrides os.environ. assert preprocess(strs[0], environ={keys[0]: keys[2]}) == keys[2] raised = False try: preprocess(strs[2], environ={keys[1]: keys[0]}) except ValueError: raised = True assert raised finally: for key in keys: if key in os.environ: del os.environ[key]
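# A minimal usage sketch of the behaviour exercised by the test above,
# assuming pylearn2 is importable. The variable names MY_DATA_ROOT and
# OTHER are made up for illustration only.
import os
from pylearn2.utils.string_utils import preprocess

os.environ['MY_DATA_ROOT'] = '/tmp/data'
# substitution from os.environ
assert preprocess('${MY_DATA_ROOT}/train.csv') == '/tmp/data/train.csv'
# substitution from an explicitly provided dict
assert preprocess('${OTHER}/x', environ={'OTHER': '/opt'}) == '/opt/x'
# a variable defined nowhere raises ValueError, as the test asserts
try:
    preprocess('${NOT_DEFINED_ANYWHERE}/x')
except ValueError:
    pass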
def get_key(config_file='${HOME}/.key_chain'):
    """
    Read and return the auth key from the config file.
    """
    config_file = preprocess(config_file)
    config = ConfigParser.RawConfigParser()
    config.read(config_file)
    return config.get('mashape', 'key')
def __init__(self): default_path = "${PYLEARN2_DATA_PATH}" local_path = "${PYLEARN2_LOCAL_DATA_PATH}" self.pid = os.getpid() try: self.dataset_remote_dir = string_utils.preprocess(default_path) self.dataset_local_dir = string_utils.preprocess(local_path) except (ValueError, string_utils.NoDataPathError, string_utils.EnvironmentVariableError): # Local cache seems to be deactivated self.dataset_remote_dir = "" self.dataset_local_dir = ""
def __init__(self, whichset, path=None):
    # here, 'final' refers to the unlabeled images from which
    # we should make predictions (images_test_rev1)
    # the train/test/valid sets come from images_training_rev1
    # 'bigtrain' is just the whole unsplit images_training_rev1
    assert whichset in ['train', 'test', 'valid', 'final', 'bigtrain']
    self.whichset = whichset

    # this is the final desired shape
    # the original shape is 424, 424
    self.img_shape = (100, 100, 3)
    self.target_shape = (37,)

    if path is None:
        path = '${PYLEARN2_DATA_PATH}/galaxy-data/'

    # load data
    path = preprocess(path)
    file_n = "{}_arrays.h5".format(os.path.join(path, "h5", whichset))
    if os.path.isfile(file_n):
        # just open file
        self.h5file = tables.openFile(file_n, mode='r')
    else:
        # create file and fill with data
        self.first_time(whichset, path, file_n)

    # assumed: default axes ordering and that the image/target arrays
    # live at the root node of the h5 file
    axes = ('b', 0, 1, 'c')
    # view_converter = DefaultViewConverter((100, 100, 3), axes)
    root = self.h5file.root
    super(galaxy_zoo_dataset, self).__init__(X=root.images, y=root.targets,
                                             axes=axes)
def __init__(self,
             path='../filtered-seizure-data',  # base directory, location of directories of filtered hkl files
             target='Dog_1',  # target is added both to the path and as a prefix to each file name
             one_hot=False,
             scale_option='usf',
             nwindows=60,
             skip=5,
             window_size=None,
             expect_labels=True):
    """
    .. todo::

        WRITEME
    """
    self.path = path
    self.target = target
    self.one_hot = one_hot
    self.scale_option = scale_option
    self.nwindows = nwindows
    self.expect_labels = expect_labels
    self.skip = skip
    self.view_converter = None
    self.Nsamples = 239766  # 10 min at 399.61 Hz

    if window_size is None:
        self.window_size = self.Nsamples // self.nwindows
    else:
        self.window_size = window_size

    # and go
    self.path = preprocess(self.path)
    X, y = self._load_data()
    super(MyPyLearn2Dataset, self).__init__(X=X, y=y)
def load_ndarray_label(name):
    """
    Load the train, valid, test label data for the dataset `name` and
    return it in ndarray format. This is only available for the toy
    dataset ule.

    Parameters
    ----------
    name : 'ule'
        Must be 'ule'

    Returns
    -------
    train_l, valid_l, test_l : ndarray
        Label data loaded
    """
    assert name in ['ule']
    common_path = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                               'UTLC', 'filetensor', name + '_')
    trname, vname, tename = [common_path + subset + '.tf'
                             for subset in ['trainl', 'validl', 'testl']]
    trainl = load_filetensor(trname)
    validl = load_filetensor(vname)
    testl = load_filetensor(tename)
    return trainl, validl, testl
def Transform(): """Test smaller version of convolutional_network.ipynb""" which_experiment = "S100" skip.skip_if_no_data() yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) data_dir = string_utils.preprocess("${PYLEARN2_DATA_PATH}") save_path = os.path.join(data_dir, "cifar10", "experiment_" + string.lower(which_experiment)) base_save_path = os.path.join(data_dir, "cifar10") # Escape potential backslashes in Windows filenames, since # they will be processed when the YAML parser will read it # as a string # save_path.replace('\\', r'\\') yaml = open("{0}/experiment_base_transform.yaml".format(yaml_file_path), "r").read() hyper_params = { "batch_size": 64, "output_channels_h1": 64, "output_channels_h2": 128, "output_channels_h3": 600, "max_epochs": 100, "save_path": save_path, "base_save_path": base_save_path, } yaml = yaml % (hyper_params) train = yaml_parse.load(yaml) train.main_loop()
def load(filepath, recurse_depth=0, retry=True): """ Parameters ---------- filepath : str A path to a file to load. Should be a pickle, Matlab, or NumPy file. recurse_depth : int End users should not use this argument. It is used by the function itself to implement the `retry` option recursively. retry : bool If True, will make a handful of attempts to load the file before giving up. This can be useful if you are for example calling show_weights.py on a file that is actively being written to by a training script--sometimes the load attempt might fail if the training script writes at the same time show_weights tries to read, but if you try again after a few seconds you should be able to open the file. Returns ------- loaded_object : object The object that was stored in the file. ..todo Refactor to hide recurse_depth from end users """ try: import joblib joblib_available = True except ImportError: joblib_available = False if recurse_depth == 0: filepath = preprocess(filepath) if filepath.endswith(".npy") or filepath.endswith(".npz"): return np.load(filepath) if filepath.endswith(".mat"): global io if io is None: import scipy.io io = scipy.io try: return io.loadmat(filepath) except NotImplementedError, nei: if str(nei).find("HDF reader") != -1: global hdf_reader if hdf_reader is None: import h5py hdf_reader = h5py return hdf_reader.File(filepath) else: raise # this code should never be reached assert False
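# Hedged usage sketch for the loader above. The paths are hypothetical and
# assume PYLEARN2_DATA_PATH is set; `load` expands ${...} via `preprocess`
# before dispatching on the file extension (.npy/.npz, .mat, or pickle).
from pylearn2.utils import serial

model = serial.load('${PYLEARN2_DATA_PATH}/my_experiment/model_best.pkl')
arrays = serial.load('${PYLEARN2_DATA_PATH}/my_experiment/features.npy')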
def __init__(self, dataset, model, algorithm=None, save_path=None,
             save_freq=0, extensions=None, allow_overwrite=True):
    """
    Construct a Train instance.

    Parameters
    ----------
    dataset : `pylearn2.datasets.dataset.Dataset`
    model : `pylearn2.models.model.Model`
    algorithm : <Optional>
        `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
    save_path : <Optional> str
        Path to save (with pickle / joblib) the model.
    save_freq : <Optional> int
        Frequency of saves, in epochs. A frequency of zero disables
        automatic saving altogether. A frequency of 1 saves every epoch.
        A frequency of 2 saves every other epoch, etc. (default=0, i.e.
        never save). Note: when automatic saving is enabled (i.e.
        save_freq > 0), the model is always saved after learning, even
        when the final epoch is not a multiple of `save_freq`.
    extensions : <Optional> iterable
        A collection of `TrainExtension` objects whose callbacks are
        triggered at various points in learning.
    allow_overwrite : <Optional> bool
        If `True`, will save the model to save_path even if there is
        already something there. Otherwise, will raise an error if the
        `save_path` is already occupied.
    """
    self.allow_overwrite = allow_overwrite
    self.first_save = True
    self.dataset = dataset
    self.model = model
    self.algorithm = algorithm

    if save_path is not None:
        if save_freq == 0:
            warnings.warn('save_path specified but save_freq is 0 '
                          '(never save). Is this intentional?')
        self.save_path = preprocess(save_path)
    else:
        if save_freq > 0:
            phase_variable = 'PYLEARN2_TRAIN_PHASE'
            if phase_variable in os.environ:
                # os.environ values are strings, so cast before using %d
                phase = 'phase%d' % int(os.environ[phase_variable])
                tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                          phase, 'pkl']
            else:
                tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
            self.save_path = '.'.join(tokens)
    self.save_freq = save_freq

    if hasattr(self.dataset, 'yaml_src'):
        self.model.dataset_yaml_src = self.dataset.yaml_src
    else:
        warnings.warn("dataset has no yaml src, model won't know what " +
                      "data it was trained on")

    self.extensions = extensions if extensions is not None else []
    self.training_seconds = sharedX(value=0,
                                    name='training_seconds_this_epoch')
    self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
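# Sketch of how the saving options above interact, assuming `dataset`,
# `model` and `algorithm` are already-constructed pylearn2 objects; the
# save path is hypothetical and is expanded through preprocess().
from pylearn2.train import Train

trainer = Train(dataset=dataset,
                model=model,
                algorithm=algorithm,
                save_path='${PYLEARN2_TRAIN_DIR}/model.pkl',
                save_freq=1)  # save after every epoch (0 = never save)
trainer.main_loop()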
def __init__(self, dataset, model, algorithm=None, save_path=None,
             save_freq=0, extensions=None, allow_overwrite=True):
    self.allow_overwrite = allow_overwrite
    self.first_save = True
    self.dataset = dataset
    self.model = model
    self.algorithm = algorithm

    if save_path is not None:
        if save_freq == 0:
            warnings.warn('save_path specified but save_freq is 0 '
                          '(never save). Is this intentional?')
        self.save_path = preprocess(save_path)
    else:
        if save_freq > 0:
            phase_variable = 'PYLEARN2_TRAIN_PHASE'
            if phase_variable in os.environ:
                # os.environ values are strings, so cast before using %d
                phase = 'phase%d' % int(os.environ[phase_variable])
                tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                          phase, 'pkl']
            else:
                tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
            self.save_path = '.'.join(tokens)
    self.save_freq = save_freq

    if hasattr(self.dataset, 'yaml_src'):
        self.model.dataset_yaml_src = self.dataset.yaml_src
    else:
        warnings.warn("dataset has no yaml src, model won't know what " +
                      "data it was trained on")

    self.extensions = extensions if extensions is not None else []
    self.training_seconds = sharedX(value=0,
                                    name='training_seconds_this_epoch')
    self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
def __init__(self, path, n_labels=2, start=None, stop=None, del_raw=True, x_only=False): self.del_raw = del_raw path = preprocess(path) x, y = CSVDataset._load_data(path, del_raw=del_raw) if np.isnan(np.min(y)): y = None else: y = y.astype(int).reshape(-1, 1) if start is not None: if stop is None: stop = x.shape[0] assert start >= 0 assert start < stop if not (stop <= x.shape[0]): raise ValueError("stop must be less than the # of examples but " + "stop is " + str(stop) + " and there are " + str(x.shape[0]) + " examples.") x = x[start:stop, :] if y is not None: y = y[start:stop, :] if x_only: y = None n_labels = None super(CSVDataset, self).__init__(X=x, y=y, y_labels=n_labels)
def _unpickle(cls, file): """ .. todo:: What is this? why not just use serial.load like the CIFAR-100 class? Whoever wrote it shows up as "unknown" in git blame. """ from pylearn2.utils import string_utils fname = os.path.join(string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py', file) # fname = os.path.join('/Users/karino-t/data/cifar10/cifar-10-batches-py',file) if not os.path.exists(fname): raise IOError(fname+" was not found. You probably need to " "download the CIFAR-10 dataset by using the " "download script in " "pylearn2/scripts/datasets/download_cifar10.sh " "or manually from " "http://www.cs.utoronto.ca/~kriz/cifar.html") fname = cache.datasetCache.cache_file(fname) _logger.info('loading file %s' % fname) fo = open(fname, 'rb') dict = cPickle.load(fo) fo.close() return dict
def __init__(self, path = 'train.csv', one_hot = False, expect_labels = True, expect_headers = True, delimiter = ',', col_number = 10): """ .. todo:: WRITEME """ self.path = path self.one_hot = one_hot self.expect_labels = expect_labels self.expect_headers = expect_headers self.delimiter = delimiter self.col_number = col_number self.view_converter = None # and go self.path = preprocess(self.path) X, y = self._load_data() super(CSVModified, self).__init__(X=X, y=y)
def load(filepath, recurse_depth=0): try: import joblib joblib_available = True except ImportError: joblib_available = False if recurse_depth == 0: filepath = preprocess(filepath) if filepath.endswith('.npy'): return np.load(filepath) if filepath.endswith('.mat'): global io if io is None: import scipy.io io = scipy.io try: return io.loadmat(filepath) except NotImplementedError, nei: if str(nei).find('HDF reader') != -1: global hdf_reader if hdf_reader is None: import h5py hdf_reader = h5py return hdf_reader.File(filepath) else: raise #this code should never be reached assert False
def main(): base = '${PYLEARN2_DATA_PATH}/esp_game/ESPGame100k/labels/' base = preprocess(base) paths = sorted(os.listdir(base)) assert len(paths) == 100000 words = {} for i, path in enumerate(paths): if i % 1000 == 0: print(i) path = base+path f = open(path, 'r') lines = f.readlines() for line in lines: word = line[: -1] if word not in words: words[word] = 1 else: words[word] += 1 ranked_words = sorted(words.keys(), key=lambda x: -words[x]) ranked_words = [word_ + '\n' for word_ in ranked_words[0:4000]] f = open('wordlist.txt', 'w') f.writelines(ranked_words) f.close()
def __init__(self, save_dir): PYLEARN2_TRAIN_DIR = preprocess('${PYLEARN2_TRAIN_DIR}') PYLEARN2_TRAIN_BASE_NAME = preprocess('${PYLEARN2_TRAIN_BASE_NAME}') src = os.path.join(PYLEARN2_TRAIN_DIR, PYLEARN2_TRAIN_BASE_NAME) dst = os.path.join(save_dir, PYLEARN2_TRAIN_BASE_NAME) if not os.path.exists(save_dir): os.makedirs(save_dir) if os.path.exists(save_dir) and not os.path.isdir(save_dir): raise IOError("save path %s exists, not a directory" % save_dir) elif not os.access(save_dir, os.W_OK): raise IOError("permission error creating %s" % dst) with log_timing(log, 'copying yaml from {} to {}'.format(src, dst)): copyfile(src, dst)
def __enter__(self): if isinstance(self._f, basestring): self._f = preprocess(self._f) self._handle = open(self._f, self._mode, self._buffering) else: self._handle = self._f return self._handle
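# Self-contained sketch of the wrapper pattern used by the __enter__ above:
# a context manager that accepts either a path (run through preprocess) or
# an already-open file object. The class name is made up for illustration.
from pylearn2.utils.string_utils import preprocess

class preprocessed_open(object):
    def __init__(self, f, mode='r', buffering=-1):
        self._f = f
        self._mode = mode
        self._buffering = buffering

    def __enter__(self):
        if isinstance(self._f, basestring):
            self._f = preprocess(self._f)
            self._handle = open(self._f, self._mode, self._buffering)
        else:
            self._handle = self._f
        return self._handle

    def __exit__(self, exc_type, exc_value, traceback):
        # only close handles we opened ourselves
        if self._handle is not self._f:
            self._handle.close()

with preprocessed_open('${HOME}/.bashrc') as f:
    first_line = f.readline()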
def show(image): """ Parameters ---------- image : PIL Image object or ndarray If ndarray, integer formats are assumed to use 0-255 and float formats are assumed to use 0-1 """ if hasattr(image, '__array__'): #do some shape checking because PIL just raises a tuple indexing error #that doesn't make it very clear what the problem is if len(image.shape) < 2 or len(image.shape) > 3: raise ValueError('image must have either 2 or 3 dimensions but its shape is '+str(image.shape)) if image.dtype == 'int8': image = np.cast['uint8'](image) elif str(image.dtype).startswith('float'): #don't use *=, we don't want to modify the input array image = image * 255. image = np.cast['uint8'](image) #PIL is too stupid to handle single-channel arrays if len(image.shape) == 3 and image.shape[2] == 1: image = image[:,:,0] try: ensure_Image() image = Image.fromarray(image) except TypeError: raise TypeError("PIL issued TypeError on ndarray of shape " + str(image.shape) + " and dtype " + str(image.dtype)) try: f = NamedTemporaryFile(mode='r', suffix='.png', delete=False) except TypeError: # before python2.7, we can't use the delete argument f = NamedTemporaryFile(mode='r', suffix='.png') """ TODO: prior to python 2.7, NamedTemporaryFile has no delete = False argument unfortunately, that means f.close() deletes the file. we then save an image to the file in the next line, so there's a race condition where for an instant we don't actually have the file on the filesystem reserving the name, and then write to that name anyway TODO: see if this can be remedied with lower level calls (mkstemp) """ warnings.warn('filesystem race condition') name = f.name f.flush() f.close() image.save(name) viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}') if os.name == 'nt': subprocess.Popen(viewer_command + ' ' + name +' && del ' + name, shell = True) else: subprocess.Popen(viewer_command + ' ' + name +' ; rm ' + name, shell = True)
def __init__(self, which_set, base_path = '${PYLEARN2_DATA_PATH}/hoge', start = None, stop = None, preprocessor = None, fit_preprocessor = False, axes = ('b', 0, 1, 'c'), fit_test_preprocessor = False): """ which_set: A string specifying which portion of the dataset to load. Valid values are 'train' or 'public_test' base_path: The directory containing the .csv files from kaggle.com. This directory should be writable; if the .csv files haven't already been converted to npy, this class will convert them to save memory the next time they are loaded. fit_preprocessor: True if the preprocessor is allowed to fit the data. fit_test_preprocessor: If we construct a test set based on this dataset, should it be allowed to fit the test set? """ self.test_args = locals() self.test_args['which_set'] = 'public_test' self.test_args['fit_preprocessor'] = fit_test_preprocessor del self.test_args['start'] del self.test_args['stop'] del self.test_args['self'] files = {'train': 'train.csv', 'public_test' : 'test.csv'} try: filename = files[which_set] except KeyError: raise ValueError("Unrecognized dataset name: " + which_set) path = base_path + '/' + filename path = preprocess(path) X, y = self._load_data(path, which_set == 'train') if start is not None: assert which_set != 'test' assert isinstance(start, int) assert isinstance(stop, int) assert start >= 0 assert start < stop assert stop <= X.shape[0] X = X[start:stop, :] if y is not None: y = y[start:stop, :] view_converter = DefaultViewConverter(shape=[48,48,1], axes=axes) super(HogeDataset, self).__init__(X=X, y=y, view_converter=view_converter) if preprocessor: preprocessor.apply(self, can_fit=fit_preprocessor)
def __init__(self, which_set = 'full', path = 'train.mat', one_hot = False, colorspace = 'none', step = 1, start = None, stop = None, center = False, rescale = False, gcn = None, toronto_prepro = False, axes=('b', 0, 1, 'c')): self.__dict__.update(locals()) del self.self # #self.one_hot = one_hot #self.colorspace = colorspace #self.step=step #self.which_set=which_set self.view_converter = None self.path = preprocess(self.path) X, y = self._load_data() if center: X -= 127.5 #self.center = center if rescale: X /= 127.5 #self.rescale = rescale if toronto_prepro: assert not center assert not gcn X = X / 255. if which_set == 'test': other = MATDATA(which_set='train') oX = other.X oX /= 255. X = X - oX.mean(axis=0) else: X = X - X.mean(axis=0) #self.toronto_prepro = toronto_prepro #self.gcn = gcn if gcn is not None: gcn = float(gcn) X = global_contrast_normalize(X, scale=gcn, min_divisor=1e-8) view_converter = DefaultViewConverter(( self.windowSize,self.windowSize,self.channels), axes) super(MATDATA, self).__init__(X=X, y=y, view_converter=view_converter)
def _unpickle(cls, file): from pylearn2.utils import string_utils fname = os.path.join(string_utils.preprocess("${PYLEARN2_DATA_PATH}"), "cifar10", "cifar-10-batches-py", file) _logger.info("loading file %s" % fname) fo = open(fname, "rb") dict = cPickle.load(fo) fo.close() return dict
def main(): data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10') print('Loading STL10-10 unlabeled and train datasets...') downsampled_dir = data_dir + '/stl10_32x32' data = serial.load(downsampled_dir + '/unlabeled.pkl') supplement = serial.load(downsampled_dir + '/train.pkl') print('Concatenating datasets...') data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0)) del supplement print("Preparing output directory...") patch_dir = data_dir + '/stl10_patches_8x8' serial.mkdir(patch_dir) README = open(patch_dir + '/README', 'w') README.write(textwrap.dedent(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. data.pkl contains a pylearn2 Dataset object defining an unlabeled dataset of 2 million 6x6 approximately whitened, contrast-normalized patches drawn uniformly at random from a downsampled (to 32x32) version of the STL-10 train and unlabeled datasets. preprocessor.pkl contains a pylearn2 Pipeline object that was used to extract the patches and approximately whiten / contrast normalize them. This object is necessary when extracting features for supervised learning or test set classification, because the extracted features must be computed using inputs that have been whitened with the ZCA matrix learned and stored by this Pipeline. They were created with the pylearn2 script make_stl10_patches.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """)) README.close() print("Preprocessing the data...") pipeline = preprocessing.Pipeline() pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=2*1000*1000)) pipeline.items.append( preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True)) pipeline.items.append(preprocessing.ZCA()) data.apply_preprocessor(preprocessor=pipeline, can_fit=True) data.use_design_loc(patch_dir + '/data.npy') serial.save(patch_dir + '/data.pkl', data) serial.save(patch_dir + '/preprocessor.pkl', pipeline)
def __init__(self, which_set, stop=None): assert which_set in ['train', 'valid'] self._stop = stop # TextDatasetMixin parameters self._unknown_index = 0 self._end_of_word_index = 100 self._case_sensitive = True with open(preprocess('${PYLEARN2_DATA_PATH}/word2vec/' 'char_vocab.pkl')) as f: self._vocabulary = cPickle.load(f) # Load the data with tables.open_file(preprocess('${PYLEARN2_DATA_PATH}/word2vec/' 'characters.h5')) as f: node = f.get_node('/characters_%s' % which_set) # VLArray is strange, and this seems faster than reading node[:] if self._stop is not None: self.X = np.asarray([char_sequence[:, np.newaxis] for char_sequence in node[:self._stop]]) else: self.X = np.asarray([char_sequence[:, np.newaxis] for char_sequence in node]) # Format is [batch, time, data] with tables.open_file(preprocess('${PYLEARN2_DATA_PATH}/word2vec/' 'embeddings.h5')) as f: node = f.get_node('/embeddings_%s' % which_set) if self._stop is not None: self.y = node[:self._stop] else: self.y = node[:] with open(preprocess('/data/lisatmp3/devincol/normalization.pkl')) as f: (means, stds) = cPickle.load(f) print "normalizing targets" self.y = (self.y - means)/stds source = ('features', 'targets') space = CompositeSpace([SequenceDataSpace(IndexSpace(dim=1, max_labels=101)), VectorSpace(dim=300)]) super(Word2Vec, self).__init__(data=(self.X, self.y), data_specs=(space, source))
def __init__(self, path = 'train', column = None, one_hot = False, with_labels = True, start = None, stop = None, preprocessor = None, fit_preprocessor = False, fit_test_preprocessor = False): """ which_set: A string specifying which portion of the dataset to load. Valid values are 'train' or 'public_test' base_path: The directory containing the .csv files from kaggle.com. This directory should be writable; if the .csv files haven't already been converted to npy, this class will convert them to save memory the next time they are loaded. fit_preprocessor: True if the preprocessor is allowed to fit the data. fit_test_preprocessor: If we construct a test set based on this dataset, should it be allowed to fit the test set? """ # self._iter_targets = True # whatever that means / won't work self.no_classes = 2 # won't work TODO self.test_args = locals() self.test_args['which_set'] = 'test' self.test_args['fit_preprocessor'] = fit_test_preprocessor del self.test_args['start'] del self.test_args['stop'] del self.test_args['self'] path = preprocess(path) X, y = self._load_data( path, column, with_labels ) if start is not None: assert which_set != 'test' assert isinstance(start, int) assert isinstance(stop, int) assert start >= 0 assert start < stop assert stop <= X.shape[0] X = X[start:stop, :] if y is not None: y = y[start:stop, :] super(TestDataset, self).__init__(X=X, y=y) if preprocessor: preprocessor.apply(self, can_fit=fit_preprocessor)
def main(): data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print('Loading CIFAR-100 train dataset...') train = CIFAR100(which_set='train', gcn=55.) print("Preparing output directory...") output_dir = data_dir + '/pylearn2_gcn_whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write(textwrap.dedent(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples. preprocessor.pkl contains a pylearn2 ZCA object that was used to approximately whiten the images. You may want to use this object later to preprocess other images. They were created with the pylearn2 script make_cifar100_gcn_whitened.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """)) README.close() print("Learning the preprocessor \ and preprocessing the unsupervised train data...") preprocessor = preprocessing.ZCA() train.apply_preprocessor(preprocessor=preprocessor, can_fit=True) print('Saving the training data') train.use_design_loc(output_dir+'/train.npy') serial.save(output_dir + '/train.pkl', train) print("Loading the test data") test = CIFAR100(which_set='test', gcn=55.) print("Preprocessing the test data") test.apply_preprocessor(preprocessor=preprocessor, can_fit=False) print("Saving the test data") test.use_design_loc(output_dir+'/test.npy') serial.save(output_dir+'/test.pkl', test) serial.save(output_dir + '/preprocessor.pkl', preprocessor)
def __init__(self, start=None, stop=None, axes=("b", 0, 1, "c"), stdev=0.8, hack=None, preproc="GCN"): # self.translation_dict = OrderedDict({1: 'left_eyebrow_inner_end', 2: 'mouth_top_lip_bottom', 3: 'right_ear_canal', 4: 'right_ear_top', 5: 'mouth_top_lip', 6: 'mouth_bottom_lip_top', 7: 'right_eyebrow_center', 8: 'chin_left', 9: 'nose_tip', 10: 'left_eyebrow_center_top', 11: 'left_eye_outer_corner', 12: 'right_ear', 13: 'mouth_bottom_lip', 14: 'left_eye_center', 15: 'left_mouth_outer_corner', 16: 'left_eye_center_top', 17: 'left_ear_center', 18: 'nostrils_center', 19: 'right_eye_outer_corner', 20: 'right_eye_center_bottom', 21: 'chin_center', 22: 'left_eye_inner_corner', 23: 'right_mouth_outer_corner', 24: 'left_ear_bottom', 25: 'right_eye_center_top', 26: 'right_eyebrow_inner_end', 27: 'left_eyebrow_outer_end', 28: 'left_ear_top', 29: 'right_ear_center', 30: 'nose_center_top', 31: 'face_center', 32: 'right_eye_inner_corner', 33: 'right_eyebrow_center_top', 34: 'left_eyebrow_center', 35: 'right_eye_pupil', 36: 'right_ear_bottom', 37: 'mouth_left_corner', 38: 'left_eye_center_bottom', 39: 'left_eyebrow_center_bottom', 41: 'mouth_right_corner', 42: 'right_nostril', 43: 'right_eye_center', 44: 'chin_right', 45: 'right_eyebrow_outer_end', 46: 'left_eye_pupil', 47: 'mouth_center', 48: 'left_nostril', 49: 'right_eyebrow_center_bottom', 50: 'left_ear_canal', 51: 'left_ear', 52: 'face_right', 53: 'face_left'}) self.name = hack self.stdev = stdev self.axes = axes self.pixels = numpy.arange(0, 96).reshape((1, 96)) for i in xrange(len(keypoints_names) * 2 - 1): self.pixels = numpy.vstack((self.pixels, numpy.arange(0, 96).reshape((1, 96)))) # self.which_set = which_set if hack is not None: X = LazyMemmap( preprocess("/Tmp/aggarwal/EmotiW_" + preproc + "_" + hack + ".npy"), dtype="float32", mode="c" ) else: X = LazyMemmap(preprocess("${PYLEARN2_DATA_PATH}/faces/hdf5/complete_train_x.npy"), dtype="uint8", mode="c") Y = LazyMemmap( preprocess("${PYLEARN2_DATA_PATH}/faces/hdf5/complete_train_y.npy"), dtype=numpy.float32, mode="c" ) num_examples = len(X) / (96.0 * 96.0 * 3.0) if stop is None: stop = num_examples if start is None: start = 0 X = X.view()[start * 96 * 96 * 3 : stop * 96 * 96 * 3] Y = Y.view()[start * len(keypoints_names) * 2 : stop * len(keypoints_names) * 2] X.shape = (stop - start, 96 * 96 * 3) # print 'shape of X', X.mean(axis = 1).shape Y.shape = (stop - start, len(keypoints_names) * 2) if hack is not None: Y = self.make_targets(Y, hack) else: Y = self.make_targets(Y, "all") super(EmotiwKeypoints, self).__init__( X=X, y=Y, view_converter=DefaultViewConverter(shape=[96, 96, 3], axes=axes) )
def __init__(self, which_set, data_path=None, term_range=None, target_type='cluster100'): """ which_set: a string specifying which portion of the dataset to load. Valid values are 'train', 'valid' or 'test' data_path: a string specifying the directory containing the webcluster data. If None (default), use environment variable WEBCLUSTER_DATA_PATH. term_range: a tuple for taking only a slice of the available terms. Default is to use all 6275. For example, an input range of (10,2000) will truncate the 10 most frequent terms and the 6275-2000=4275 les frequent terms, whereby frequency we mean how many unique documents each term is in. target_type: the type of targets to use. Valid options are 'cluster[10,100,1000]' """ self.__dict__.update(locals()) del self.self self.corpus_terms = None self.doc_info = None print "loading WebCluster DDM. which_set =", self.which_set if self.data_path is None: self.data_path \ = string_utils.preprocess('${WEBCLUSTER_DATA_PATH}') fname = os.path.join(self.data_path, which_set+'_doc_inputs.npy') X = np.load(fname) if self.term_range is not None: X = X[:,self.term_range[0]:self.term_range[1]] X = X/X.sum(1).reshape(X.shape[0],1) print X.sum(1).mean() fname = os.path.join(self.data_path, which_set+'_doc_targets.npy') # columns: 0:cluster10s, 1:cluster100s, 2:cluster1000s self.cluster_hierarchy = np.load(fname) y = None if self.target_type == 'cluster10': y = self.cluster_hierarchy[:,0] elif self.target_type == 'cluster100': y = self.cluster_hierarchy[:,1] elif self.target_type == 'cluster1000': y = self.cluster_hierarchy[:,2] elif self.target_type is None: pass else: raise NotImplementedError() DenseDesignMatrix.__init__(self, X=X, y=y) print "... WebCluster ddm loaded"
def get_relative_path(full_path):
    """
    Returns the path relative to the PYLEARN2_DATA_PATH.
    """
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')
    if not full_path.startswith(data_dir):
        raise ValueError("Expected full_path to start with "
                         "the PYLEARN2_DATA_PATH (%s). Instead it "
                         "was %s." % (data_dir, full_path))
    return os.path.relpath(full_path, data_dir)
def _unpickle(cls, file): from pylearn2.utils import string_utils fname = os.path.join( string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py', file) _logger.info('loading file %s' % fname) fo = open(fname, 'rb') dict = cPickle.load(fo) fo.close() return dict
def _getVar(key, environ=None):
    """
    Looks for a key in custom and os environments.

    Parameters
    ----------
    key : str
        The key to look for.
    environ : dict, optional
        A custom dictionary to search before the system environment.

    Returns
    -------
    None if the key was not found, a string otherwise.
    """
    if environ:
        if key in environ:
            return string_utils.preprocess(environ[key], environ=environ)
    if key in os.environ:
        return string_utils.preprocess(os.environ[key])
    return None
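# Precedence sketch for the helper above (the keys are hypothetical): the
# custom `environ` dict is consulted before os.environ, and the value found
# is itself run through preprocess, so it may contain ${...} references.
os.environ['CACHE_ROOT'] = '/scratch/cache'
assert _getVar('CACHE_ROOT', environ={'CACHE_ROOT': '/ssd/cache'}) == '/ssd/cache'
assert _getVar('CACHE_ROOT') == '/scratch/cache'
assert _getVar('DOES_NOT_EXIST') is None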
def save(filepath, obj, on_overwrite='ignore'):
    """
    Serialize `object` to a file denoted by `filepath`.

    Parameters
    ----------
    filepath : str
        A filename. If the suffix is `.joblib` and joblib can be imported,
        `joblib.dump` is used in place of the regular pickling mechanisms;
        this results in much faster saves by saving arrays as separate
        .npy files on disk. If the file suffix is `.npy` then `numpy.save`
        is attempted on `obj`. Otherwise, (c)pickle is used.
    obj : object
        A Python object to be serialized.
    on_overwrite : str, optional
        A string specifying what to do if the file already exists.
        Possible values include:

        - "ignore" : Just overwrite the existing file.
        - "backup" : Make a backup copy of the file (<filepath>.bak).
          Save the new copy. Then delete the backup copy. This allows
          recovery of the old version of the file if saving the new one
          fails.
    """
    filepath = preprocess(filepath)

    if os.path.exists(filepath):
        if on_overwrite == 'backup':
            backup = filepath + '.bak'
            shutil.move(filepath, backup)
            save(filepath, obj)
            try:
                os.remove(backup)
            except Exception, e:
                warnings.warn("Got an error while trying to remove " +
                              backup + ":" + str(e))
            return
        else:
            assert on_overwrite == 'ignore'
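# Hedged example of the overwrite-protection branch above; the target path
# and `model` object are hypothetical. With on_overwrite='backup' the old
# file is moved to <filepath>.bak while the new copy is written, and the
# backup is removed once the save succeeds.
from pylearn2.utils import serial

serial.save('${PYLEARN2_TRAIN_DIR}/model.pkl', model, on_overwrite='backup')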
def load(stream, overrides=None, **kwargs):
    """
    Loads a YAML configuration from a string or file-like object.

    Parameters
    ----------
    stream : str or object
        Either a string containing valid YAML or a file-like object
        supporting the .read() interface.
    overrides : dict, optional
        A dictionary containing overrides to apply. The location of the
        override is specified in the key as a dot-delimited path to the
        desired parameter, e.g. "model.corruptor.corruption_level".

    Returns
    -------
    graph : dict or object
        The dictionary or object (if the top-level element specified a
        Python object to instantiate).

    Notes
    -----
    Other keyword arguments are passed on to `yaml.load`.
    """
    global is_initialized
    if not is_initialized:
        initialize()

    if isinstance(stream, basestring):
        string = stream
    else:
        string = '\n'.join(stream.readlines())

    processed_string = preprocess(string)
    proxy_graph = yaml.load(processed_string, **kwargs)
    if overrides is not None:
        handle_overrides(proxy_graph, overrides)
    return instantiate_all(proxy_graph)
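# Sketch of the overrides mechanism documented above, using a made-up YAML
# string. This assumes the `load` variant defined here (which accepts an
# `overrides` dict); other pylearn2 releases may expose a different
# signature. Keys in `overrides` are dot-delimited paths into the graph.
from pylearn2.config import yaml_parse

yaml_src = """
{
  "learning_rate": 0.01,
  "batch_size": 128,
}
"""
graph = yaml_parse.load(yaml_src, overrides={"batch_size": 64})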
def _instantiate(proxy, bindings=None): """ Instantiate a (hierarchy of) Proxy object(s). Parameters ---------- proxy : object A `Proxy` object or list/dict/literal. Strings are run through `preprocess`. bindings : dict, opitonal A dictionary mapping previously instantiated `Proxy` objects to their instantiated values. Returns ------- obj : object The result object from recursively instantiating the object DAG. Notes ----- This should not be considered part of the stable, public API. """ if bindings is None: bindings = {} if isinstance(proxy, Proxy): return _instantiate_proxy_tuple(proxy, bindings) elif isinstance(proxy, dict): # Recurse on the keys too, for backward compatibility. # Is the key instantiation feature ever actually used, by anyone? return dict((_instantiate(k, bindings), _instantiate(v, bindings)) for k, v in proxy.iteritems()) elif isinstance(proxy, list): return [_instantiate(v, bindings) for v in proxy] # In the future it might be good to consider a dict argument that provides # a type->callable mapping for arbitrary transformations like this. elif isinstance(proxy, basestring): return preprocess(proxy) else: return proxy
def _unpickle(cls, file): """ .. todo:: What is this? why not just use serial.load like the CIFAR-100 class? Whoever wrote it shows up as "unknown" in git blame. """ from pylearn2.utils import string_utils fname = os.path.join( string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py', file) if not os.path.exists(fname): raise IOError(fname+" was not found. You probably need to download " "the CIFAR-10 dataset by using the download script in pylearn2/scripts/download_cifar10.sh " "or manually from http://www.cs.utoronto.ca/~kriz/cifar.html") _logger.info('loading file %s' % fname) fo = open(fname, 'rb') dict = cPickle.load(fo) fo.close() return dict
def __init__(self, npy_filename, which_set, split): assert which_set in ['train', 'valid', 'test'] self.split = split # Load data from .npy file npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'), 'icml07data', 'npy', npy_filename) x_file = npy_filename_root + '_inputs.npy' y_file = npy_filename_root + '_labels.npy' x_file = datasetCache.cache_file(x_file) y_file = datasetCache.cache_file(y_file) data_x = np.load(x_file, mmap_mode='r') data_y = np.load(y_file, mmap_mode='r') # some sanity checkes assert np.isfinite(data_x).all() assert np.isfinite(data_y).all() assert data_x.shape[0] == data_y.shape[0] # extract n_train, n_valid, n_test = split sets = { 'train': (0, n_train), 'valid': (n_train, n_train + n_valid), 'test': (n_train + n_valid, n_train + n_valid + n_test) } start, end = sets[which_set] data_x = data_x[start:end] data_y = data_y[start:end] view_converter = DefaultViewConverter((28, 28, 1)) super(ICML07DataSet, self).__init__(X=data_x, y=data_y, y_labels=data_y.max() + 1, view_converter=view_converter)
def __init__(self, which_set, center=False, scale=False, start=None, stop=None, axes=('b', 0, 1, 'c'), preprocessor = None): assert which_set in self.mapper.keys() self.__dict__.update(locals()) del self.self path = '${PYLEARN2_DATA_PATH}/SVHN/format2/' # load data path = preprocess(path) data_x, data_y = self.make_data(which_set, path) # rescale or center if permitted if center and scale: data_x -= 127.5 data_x /= 127.5 elif center: data_x -= 127.5 elif scale: data_x /= 255. view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3), axes) super(SVHN_On_Memory, self).__init__(X=data_x, y=data_y, view_converter=view_converter) if preprocessor: if which_set in ['train', 'train_all', 'splitted_train']: can_fit = True else: can_fit = False preprocessor.apply(self, can_fit) del data_x, data_y gc.collect()
def __init__(self, which_set, standardize_quantitative=True, separate_types=False, prefix=None, one_hot=False): if separate_types: raise NotImplementedError("This won't work as long as this " "is a subset of DenseDesignMatrix") self._separate_types = separate_types self._standardize_quantitative = standardize_quantitative self._prefix = prefix self._one_hot = one_hot prefix = prefix if prefix is not None else "${PYLEARN2_DATA_PATH}" self._raw = load_covertype( preprocess(os.path.join(prefix, "covertype")), which_sets=which_set, separate_types=self._separate_types, standardize_quantitative=self._standardize_quantitative ) labels = self._raw[which_set]['labels'] - 1 # 0 - 6, not 1 - 7 if one_hot: labels = one_hot(labels, max_label=6) super(CoverType, self).__init__( X=self._raw[which_set]['features'], y=labels )
def load_ndarray_label(name): """ Load the train,valid,test label data for the dataset `name` and return it in ndarray format. This is only available for the toy dataset ule. Parameters ---------- name : 'ule' Must be 'ule' Returns ------- train_l. valid_l, test_l : ndarray Label data loaded """ assert name in ['ule'] common_path = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'), 'UTLC', 'filetensor', name+'_') trname,vname,tename = [common_path+subset+'.tf' for subset in ['trainl','validl','testl']] trainl = load_filetensor(trname) validl = load_filetensor(vname) testl = load_filetensor(tename) return trainl, validl, testl
def __init__(self, path='train.mat', start=None, stop=None, center=False, rescale=False, axes=('b', 0, 1, 'c'), channels=4): self.__dict__.update(locals()) del self.self self.filters = tables.Filters(complib='blosc', complevel=5) self.view_converter = None self.path = preprocess(self.path) X, y = self._load_data() self.windowSize = np.uint8(np.sqrt(X.shape[1] / 4)) if center and rescale: X[:] -= 127.5 X[:] /= 127.5 elif center: X[:] -= 127.5 elif rescale: X[:] /= 255. view_converter = DefaultViewConverter((61, 61, 4), axes) super(MATDATAPyTables, self).__init__(X=X, y=y, view_converter=view_converter) self.h5file.flush()
def __init__(self, path, expect_headers=False, delimiter=",",
             which_set="train"):
    """
    @param path: path of the data, should be a pkl file (str)
    @param expect_headers: if there is a header on the first row (bool)
    @param delimiter: delimiter of the data (str)
    @param which_set: specify which set is being used
                      (total, train, valid, test)
    """
    self.path = path
    self.delimiter = delimiter
    self.expect_headers = expect_headers
    self.which_set = which_set

    self.path = preprocess(self.path)
    X, y = self._load_data()

    # slice boundaries must be ints, so cast after scaling by the split ratio
    start = 0
    end = X.shape[0]
    if self.which_set == "train":
        start = 0
        end = int(end * 0.6)
    elif self.which_set == "valid":
        start = int(end * 0.6)
        end = int(end * 0.8)
    elif self.which_set == "test":
        start = int(end * 0.8)

    X = X[start:end, :]
    y = y[start:end, :]

    super(IFDataset, self).__init__(X=X, y=y)
def Transform(): """Test smaller version of convolutional_network.ipynb""" which_experiment = 'S100' skip.skip_if_no_data() yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}') save_path = os.path.join(data_dir, 'cifar10', 'experiment_'+string.lower(which_experiment)) base_save_path = os.path.join(data_dir, 'cifar10') # Escape potential backslashes in Windows filenames, since # they will be processed when the YAML parser will read it # as a string #save_path.replace('\\', r'\\') yaml = open("{0}/experiment_base_transform.yaml".format(yaml_file_path), 'r').read() hyper_params = {'batch_size': 64, 'output_channels_h1': 64, 'output_channels_h2': 128, 'output_channels_h3': 600, 'max_epochs': 100, 'save_path': save_path, 'base_save_path' : base_save_path } yaml = yaml % (hyper_params) train = yaml_parse.load(yaml) train.main_loop()
def __init__(self, which_set, base_path='${PYLEARN2_DATA_PATH}/icml_2013_black_box', start=None, stop=None, preprocessor=None, fit_preprocessor=False, fit_test_preprocessor=False): """ which_set: A string specifying which portion of the dataset to load. Valid values are 'train' or 'public_test' base_path: The directory containing the .csv files from kaggle.com. This directory should be writable; if the .csv files haven't already been converted to npy, this class will convert them to save memory the next time they are loaded. fit_preprocessor: True if the preprocessor is allowed to fit the data. fit_test_preprocessor: If we construct a test set based on this dataset, should it be allowed to fit the test set? """ self.test_args = locals() self.test_args['which_set'] = 'public_test' self.test_args['fit_preprocessor'] = fit_test_preprocessor del self.test_args['start'] del self.test_args['stop'] del self.test_args['self'] files = {'train': 'train.csv', 'public_test': 'test.csv'} sizes = {'train': 1000, 'public_test': 10000, 'extra': 135735} if which_set == 'extra': path = base_path + '/' + 'extra_unsupervised_data.npy' X = serial.load(path).T y = None else: try: filename = files[which_set] except KeyError: raise ValueError("Unrecognized dataset name: " + which_set) path = base_path + '/' + filename path = preprocess(path) expect_labels = which_set == 'train' X, y = self._load_data(path, expect_labels) size = sizes[which_set] if X.shape[0] != size: raise ValueError("Expected " + str(size) + " examples, got " + str(X.shape[0])) if start is not None: assert which_set != 'test' assert isinstance(start, int) assert isinstance(stop, int) assert start >= 0 assert start < stop if not (stop <= X.shape[0]): raise ValueError( "stop must be less than the # of examples but " + "stop is " + str(stop) + " and there are " + str(X.shape[0]) + " examples.") X = X[start:stop, :] if y is not None: y = y[start:stop, :] super(BlackBoxDataset, self).__init__(X=X, y=y, y_labels=9) if preprocessor: preprocessor.apply(self, can_fit=fit_preprocessor)
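# Hedged construction sketch for the dataset defined above: assumes the
# ICML 2013 black box CSVs are available under ${PYLEARN2_DATA_PATH}. The
# 900/100 split is illustrative; per `sizes`, 'train' holds 1000 examples.
train = BlackBoxDataset(which_set='train', start=0, stop=900)
valid = BlackBoxDataset(which_set='train', start=900, stop=1000)
extra = BlackBoxDataset(which_set='extra')  # unlabeled .npy data, y is None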
def __init__(self, which_set, path = None, center = False, scale = False, start = None, stop = None, axes = ('b', 0, 1, 'c'), preprocessor = None): """ Only for faster access there is a copy of hdf5 file in PYLEARN2_DATA_PATH but it mean to be only readable. If you wish to modify the data, you should pass a local copy to the path argument. """ assert which_set in self.mapper.keys() self.__dict__.update(locals()) del self.self if path is None: path = '${PYLEARN2_DATA_PATH}/SVHN/format2/' mode = 'r' else: mode = 'r+' logging.warning("Because path is not same as PYLEARN2_DATA_PATH "\ "be aware that data might have been modified or pre-processed.") if mode == 'r' and (scale or center or (start != None) or (stop != None)): raise ValueError("Only for speed there is a copy of hdf5 " +\ "file in PYLEARN2_DATA_PATH but it meant to be only " +\ "readable. If you wish to modify the data, you should " +\ "pass a local copy to the path argument.") # load data path = preprocess(path) file_n = "{}_32x32.h5".format(os.path.join(path, "h5", which_set)) if os.path.isfile(file_n): make_new = False else: make_new = True warnings.warn("Over riding existing file: {}".format(file_n)) # if hdf5 file does not exist make them if make_new: self.filters = tables.Filters(complib='blosc', complevel=5) self.make_data(which_set, path) self.h5file = tables.openFile(file_n, mode = mode) data = self.h5file.getNode('/', "Data") if start != None or stop != None: self.h5file, data = self.resize(self.h5file, start, stop) # rescale or center if permitted if center and scale: data.X[:] -= 127.5 data.X[:] /= 127.5 elif center: data.X[:] -= 127.5 elif scale: data.X[:] /= 255. view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3), axes) super(SVHN, self).__init__(X = data.X, y = data.y, view_converter = view_converter) if preprocessor: if which_set in ['train', 'train_all', 'splitted_train']: can_fit = True preprocessor.apply(self, can_fit) self.h5file.flush()
python extract_layer_2_kmeans_features.py private_test to extract features for the ICML 2013 multimodal learning contest's private test images (which will be released 72 hours before the contest ends) """) if len(sys.argv) != 2: usage() print('(You used the wrong number of arguments)') quit(-1) _, arg = sys.argv if arg == 'public_test': base = preprocess( '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_layer_1_features' ) expected_num_images = 500 elif arg == 'private_test': base = preprocess( '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_layer_1_features' ) expected_num_images = 500 else: usage() print('Unrecognized argument value:', arg) print('Recognized values are: public_test, private_test') outdir = base[:-len('layer_1_features')] + 'layer_2_features' serial.mkdir(outdir)
def __init__(self, which_set, center=False, rescale=False, gcn=None, start=None, stop=None, axes=('b', 0, 1, 'c'), toronto_prepro=False, preprocessor=None): # note: there is no such thing as the cifar10 validation set; # pylearn1 defined one but really it should be user-configurable # (as it is here) self.axes = axes # we define here: dtype = 'uint8' ntrain = 50000 nvalid = 0 # artefact, we won't use it ntest = 10000 # we also expose the following details: self.img_shape = (3, 32, 32) self.img_size = numpy.prod(self.img_shape) self.n_classes = 10 self.label_names = [ 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck' ] # prepare loading fnames = ['data_batch_%i' % i for i in range(1, 6)] datasets = {} datapath = os.path.join( string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10', 'cifar-10-batches-py') for name in fnames + ['test_batch']: fname = os.path.join(datapath, name) if not os.path.exists(fname): raise IOError(fname + " was not found. You probably need to " "download the CIFAR-10 dataset by using the " "download script in " "pylearn2/scripts/datasets/download_cifar10.sh " "or manually from " "http://www.cs.utoronto.ca/~kriz/cifar.html") datasets[name] = cache.datasetCache.cache_file(fname) lenx = int(numpy.ceil((ntrain + nvalid) / 10000.) * 10000) x = numpy.zeros((lenx, self.img_size), dtype=dtype) y = numpy.zeros((lenx, 1), dtype=dtype) # load train data nloaded = 0 for i, fname in enumerate(fnames): _logger.info('loading file %s' % datasets[fname]) data = serial.load(datasets[fname]) x[i * 10000:(i + 1) * 10000, :] = data['data'] y[i * 10000:(i + 1) * 10000, 0] = data['labels'] nloaded += 10000 if nloaded >= ntrain + nvalid + ntest: break # load test data _logger.info('loading file %s' % datasets['test_batch']) data = serial.load(datasets['test_batch']) # process this data Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]} Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]} X = numpy.cast['float32'](Xs[which_set]) y = Ys[which_set] if isinstance(y, list): y = numpy.asarray(y).astype(dtype) if which_set == 'test': assert y.shape[0] == 10000 y = y.reshape((y.shape[0], 1)) if center: X -= 127.5 self.center = center if rescale: X /= 127.5 self.rescale = rescale if toronto_prepro: assert not center assert not gcn X = X / 255. if which_set == 'test': other = CIFAR10(which_set='train') oX = other.X oX /= 255. X = X - oX.mean(axis=0) else: X = X - X.mean(axis=0) self.toronto_prepro = toronto_prepro self.gcn = gcn if gcn is not None: gcn = float(gcn) X = global_contrast_normalize(X, scale=gcn) if start is not None: # This needs to come after the prepro so that it doesn't # change the pixel means computed above for toronto_prepro assert start >= 0 assert stop > start assert stop <= X.shape[0] X = X[start:stop, :] y = y[start:stop, :] assert X.shape[0] == y.shape[0] if which_set == 'test': assert X.shape[0] == 10000 view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3), axes) super(CIFAR10, self).__init__(X=X, y=y, view_converter=view_converter, y_labels=self.n_classes) assert not contains_nan(self.X) if preprocessor: preprocessor.apply(self)
This script also translates the data to lie in [-127.5, 127.5] instead of [0,255]. This makes it play nicer with some of pylearn's visualization tools. """ from __future__ import print_function from theano.compat.six.moves import xrange from pylearn2.datasets.stl10 import STL10 from pylearn2.datasets.preprocessing import Downsample from pylearn2.utils import string_utils as string from pylearn2.utils import serial import numpy as np print('Preparing output directory...') data_dir = string.preprocess('${PYLEARN2_DATA_PATH}') downsampled_dir = data_dir + '/stl10_32x32' serial.mkdir(downsampled_dir) README = open(downsampled_dir + '/README', 'w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. They contain pylearn2 Dataset objects defining the STL-10 dataset, but downsampled to size 32x32 and translated to lie in [-127.5, 127.5 ]. They were created with the pylearn2 script make_downsampled_stl10.py All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly.
def show(image): """ Parameters ---------- image : PIL Image object or ndarray If ndarray, integer formats are assumed to use 0-255 \ and float formats are assumed to use 0-1 """ if hasattr(image, '__array__'): #do some shape checking because PIL just raises a tuple indexing error #that doesn't make it very clear what the problem is if len(image.shape) < 2 or len(image.shape) > 3: raise ValueError('image must have either 2 or 3 dimensions but its' ' shape is ' + str(image.shape)) if image.dtype == 'int8': image = np.cast['uint8'](image) elif str(image.dtype).startswith('float'): #don't use *=, we don't want to modify the input array image = image * 255. image = np.cast['uint8'](image) #PIL is too stupid to handle single-channel arrays if len(image.shape) == 3 and image.shape[2] == 1: image = image[:,:,0] try: ensure_Image() image = Image.fromarray(image) except TypeError: raise TypeError("PIL issued TypeError on ndarray of shape " + str(image.shape) + " and dtype " + str(image.dtype)) try: f = NamedTemporaryFile(mode='r', suffix='.png', delete=False) except TypeError: # before python2.7, we can't use the delete argument f = NamedTemporaryFile(mode='r', suffix='.png') """ TODO: prior to python 2.7, NamedTemporaryFile has no delete = False argument unfortunately, that means f.close() deletes the file. we then save an image to the file in the next line, so there's a race condition where for an instant we don't actually have the file on the filesystem reserving the name, and then write to that name anyway TODO: see if this can be remedied with lower level calls (mkstemp) """ warnings.warn('filesystem race condition') name = f.name f.flush() f.close() image.save(name) viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}') if os.name == 'nt': subprocess.Popen(viewer_command + ' ' + name +' && del ' + name, shell=True) else: subprocess.Popen(viewer_command + ' ' + name +' ; rm ' + name, shell=True)
""" This script makes a dataset of 32x32 contrast normalized, approximately whitened CIFAR-100 images. """ from pylearn2.utils import serial from pylearn2.datasets import preprocessing from pylearn2.utils import string_utils from pylearn2.datasets.cifar100 import CIFAR100 data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100') print 'Loading CIFAR-100 train dataset...' train = CIFAR100(which_set = 'train', gcn = 55.) print "Preparing output directory..." output_dir = data_dir + '/pylearn2_gcn_whitened' serial.mkdir( output_dir ) README = open(output_dir + '/README','w') README.write(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. train.pkl, and test.pkl each contain a pylearn2 Dataset object defining a labeled dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100 dataset. train.pkl contains labeled train examples. test.pkl contains labeled test examples.
def _load(filepath, recurse_depth=0, retry=True):
    """
    Recursively tries to load a file until success or maximum number of
    attempts.

    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file; or a .txt or .amat file that numpy.loadtxt can load.
    recurse_depth : int, optional
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool, optional
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.

    Returns
    -------
    loaded_object : object
        The object that was stored in the file.
    """
    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False

    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith('.npy') or filepath.endswith('.npz'):
        return np.load(filepath)

    if filepath.endswith('.amat') or filepath.endswith('txt'):
        try:
            return np.loadtxt(filepath)
        except Exception:
            reraise_as("{0} cannot be loaded by serial.load (trying "
                       "to use np.loadtxt)".format(filepath))

    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError as nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath, 'r')
            else:
                raise
        # this code should never be reached
        assert False

    # for loading PY2 pickle in PY3
    encoding = {'encoding': 'latin-1'} if six.PY3 else {}

    def exponential_backoff():
        if recurse_depth > 9:
            logger.info('Max number of tries exceeded while trying to open '
                        '{0}'.format(filepath))
            logger.info('attempting to open via reading string')
            with open(filepath, 'rb') as f:
                content = f.read()
            return cPickle.loads(content, **encoding)
        else:
            nsec = 0.5 * (2.0 ** float(recurse_depth))
            logger.info("Waiting {0} seconds and trying again".format(nsec))
            time.sleep(nsec)
            return _load(filepath, recurse_depth + 1, retry)

    try:
        if not joblib_available:
            with open(filepath, 'rb') as f:
                obj = cPickle.load(f, **encoding)
        else:
            try:
                obj = joblib.load(filepath)
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                                         ("You do not have enough memory to "
                                          "open %s \n"
                                          " + Try using numpy.{save,load} "
                                          "(file with extension '.npy') "
                                          "to save your file. It uses less "
                                          "memory when reading and "
                                          "writing files than pickled "
                                          "files.") % filepath)
        else:
            improve_memory_error_message(e,
                                         "You do not have enough memory to "
                                         "open %s" % filepath)
    except (BadPickleGet, EOFError, KeyError) as e:
        if not retry:
            reraise_as(e.__class__('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except ValueError:
        logger.exception('Failed to open {0}'.format(filepath))
        if not retry:
            reraise_as(ValueError('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except Exception:
        reraise_as("Couldn't open {0}".format(filepath))

    # if the object has no yaml_src, we give it one that just says it
    # came from this file. could cause trouble if you save obj again
    # to a different location
    if not hasattr(obj, 'yaml_src'):
        try:
            obj.yaml_src = '!pkl: "' + os.path.abspath(filepath) + '"'
        except Exception:
            pass

    return obj
def __init__(self, which_set, center=False, custom_path=None): assert which_set in ['train', 'unlabeled', 'custom'] path = "${PYLEARN2_DATA_PATH}/TLChallenge" if which_set == 'train': path += '/training/training-data.dat' elif which_set == 'unlabeled': path += '/unlabelled_tiny.dat' elif which_set == 'custom': path = custom_path path = preprocess(path) X = N.fromfile(path, dtype=N.uint8, sep=' ') X = X.reshape(X.shape[0] / (32 * 32 * 3), 32 * 32 * 3, order='F') assert X.max() == 255 assert X.min() == 0 X = N.cast['float32'](X) y = None if center: X -= 127.5 view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3)) X = view_converter.design_mat_to_topo_view(X) X = N.transpose(X, (0, 2, 1, 3)) X = view_converter.topo_view_to_design_mat(X) super(TL_Challenge, self).__init__(X=X, y=y, view_converter=view_converter) assert not N.any(N.isnan(self.X)) if which_set == 'train': self.y_fine = N.fromfile(preprocess( "${PYLEARN2_DATA_PATH}/TLChallenge/training/training-labels.dat" ), dtype=N.uint8, sep=' ') assert len(self.y_fine.shape) == 1 assert self.y_fine.shape[0] == X.shape[0] #0 : aquatic_mammals #1 : fish #2 : flowers FOOD_CONTAINER = 3 FRUIT = 4 #5 : household_electrical_devices FURNITURE = 6 INSECTS = 7 #8 : large_carnivores #9 : large_man-made_outdoor_things #10 : large_natural_outdoor_scenes LARGE_OMNIVORES_HERBIVORES = 11 MEDIUM_MAMMAL = 12 #13 : non-insect_invertebrates #14 : people #15 : reptiles #16 : small_mammals #17 : trees #18 : vehicles_1 #19 : vehicles_2 self.y_coarse = self.y_fine.copy() self.y_coarse[self.y_coarse == 100] = INSECTS self.y_coarse[self.y_coarse == 101] = LARGE_OMNIVORES_HERBIVORES self.y_coarse[self.y_coarse == 102] = LARGE_OMNIVORES_HERBIVORES self.y_coarse[self.y_coarse == 103] = LARGE_OMNIVORES_HERBIVORES self.y_coarse[self.y_coarse == 104] = FRUIT self.y_coarse[self.y_coarse == 105] = FOOD_CONTAINER self.y_coarse[self.y_coarse == 106] = FRUIT self.y_coarse[self.y_coarse == 107] = MEDIUM_MAMMAL self.y_coarse[self.y_coarse == 108] = FRUIT self.y_coarse[self.y_coarse == 109] = FURNITURE assert self.y_coarse.min() == 3 assert self.y_coarse.max() == 12 for i in xrange(120): if self.y_coarse[i] == FRUIT: assert self.y_fine[i] in [104, 106, 108]
def save(filepath, obj, on_overwrite='ignore'):
    """
    Serialize `obj` to a file denoted by `filepath`.

    Parameters
    ----------
    filepath : str
        A filename. If the suffix is `.joblib` and joblib can be imported,
        `joblib.dump` is used in place of the regular pickling mechanisms;
        this results in much faster saves by saving arrays as separate .npy
        files on disk. If the file suffix is `.npy` then `numpy.save` is
        attempted on `obj`. Otherwise, (c)pickle is used.
    obj : object
        A Python object to be serialized.
    on_overwrite : str
        A string specifying what to do if the file already exists:

        - "ignore" : just overwrite it
        - "backup" : make a copy of the file (<filepath>.bak) and delete it
          when done saving the new copy. This allows recovery of the old
          version of the file if saving the new one fails.
    """
    filepath = preprocess(filepath)

    if os.path.exists(filepath):
        if on_overwrite == 'backup':
            backup = filepath + '.bak'
            shutil.move(filepath, backup)
            save(filepath, obj)
            os.remove(backup)
            return
        else:
            assert on_overwrite == 'ignore'

    try:
        _save(filepath, obj)
    except RuntimeError as e:
        """ Sometimes for large theano graphs, pickle/cPickle exceed the
            maximum recursion depth. This seems to me like a fundamental
            design flaw in pickle/cPickle. The workaround I employ here is
            the one recommended to someone who had a similar problem on
            stackexchange:

            http://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pythons-pickle-cpickle

            Obviously this does not scale and could cause a crash but I
            don't see another solution short of writing our own
            implementation of pickle.
        """
        if str(e).find('recursion') != -1:
            warnings.warn('pylearn2.utils.save encountered the following '
                          'error: ' + str(e) +
                          '\nAttempting to resolve this error by calling ' +
                          'sys.setrecursionlimit and retrying')
            old_limit = sys.getrecursionlimit()
            try:
                sys.setrecursionlimit(50000)
                _save(filepath, obj)
            finally:
                sys.setrecursionlimit(old_limit)
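A short usage sketch of the `on_overwrite='backup'` behavior documented above (the file name and object are placeholders):

from pylearn2.utils import serial

# If model.pkl already exists, it is first moved to model.pkl.bak, the new
# copy is written, and the backup is removed on success.
serial.save('model.pkl', {'weights': [1.0, 2.0]}, on_overwrite='backup')
obj = serial.load('model.pkl')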
def __init__(self, dataset, model, algorithm=None, save_path=None,
             save_freq=0, extensions=None, allow_overwrite=True):
    """
    Construct a Train instance.

    Parameters
    ----------
    dataset : `pylearn2.datasets.dataset.Dataset`
    model : `pylearn2.models.model.Model`
    algorithm : <Optional>
        `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
    save_path : <Optional> str
        Path to save (with pickle / joblib) the model.
    save_freq : <Optional> int
        Frequency of saves, in epochs. A frequency of zero disables
        automatic saving altogether. A frequency of 1 saves every epoch. A
        frequency of 2 saves every other epoch, etc. (default=0, i.e. never
        save). Note: when automatic saving is enabled (e.g. save_freq > 0),
        the model is always saved after learning, even when the final epoch
        is not a multiple of `save_freq`.
    extensions : <Optional> iterable
        A collection of `TrainExtension` objects whose callbacks are
        triggered at various points in learning.
    allow_overwrite : <Optional> bool
        If `True`, will save the model to save_path even if there is
        already something there. Otherwise, will raise an error if the
        `save_path` is already occupied.
    """
    self.allow_overwrite = allow_overwrite
    self.first_save = True
    self.dataset = dataset
    self.model = model
    self.algorithm = algorithm
    if save_path is not None:
        if save_freq == 0:
            warnings.warn('save_path specified but save_freq is 0 '
                          '(never save). Is this intentional?')
        self.save_path = preprocess(save_path)
    else:
        if save_freq > 0:
            phase_variable = 'PYLEARN2_TRAIN_PHASE'
            if phase_variable in os.environ:
                # os.environ values are strings; convert before using %d
                phase = 'phase%d' % int(os.environ[phase_variable])
                tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                          phase, 'pkl']
            else:
                tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
            self.save_path = '.'.join(tokens)
    self.save_freq = save_freq

    if hasattr(self.dataset, 'yaml_src'):
        self.model.dataset_yaml_src = self.dataset.yaml_src
    else:
        warnings.warn("dataset has no yaml src, model won't know what " +
                      "data it was trained on")

    self.extensions = extensions if extensions is not None else []
    self.training_seconds = sharedX(value=0,
                                    name='training_seconds_this_epoch')
    self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
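A worked example of the fallback naming logic above, using hypothetical environment-variable values:

import os

# Hypothetical values: with these two variables set, the derived checkpoint
# path is 'mnist_mlp.phase2.pkl'; without PYLEARN2_TRAIN_PHASE it would be
# 'mnist_mlp.pkl'.
os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'] = 'mnist_mlp'
os.environ['PYLEARN2_TRAIN_PHASE'] = '2'
tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
          'phase%d' % int(os.environ['PYLEARN2_TRAIN_PHASE']),
          'pkl']
assert '.'.join(tokens) == 'mnist_mlp.phase2.pkl'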
def __init__( self, which_set, base_path='/data/vision/billf/manifold-learning/DL/Data/icml_2013_emotions', start=None, stop=None, preprocessor=None, fit_preprocessor=False, axes=('b', 0, 1, 'c'), fit_test_preprocessor=False, randindex=None, trainindex=None): """ which_set: A string specifying which portion of the dataset to load. Valid values are 'train' or 'public_test' base_path: The directory containing the .csv files from kaggle.com. This directory should be writable; if the .csv files haven't already been converted to npy, this class will convert them to save memory the next time they are loaded. fit_preprocessor: True if the preprocessor is allowed to fit the data. fit_test_preprocessor: If we construct a test set based on this dataset, should it be allowed to fit the test set? """ self.test_args = locals() self.test_args['which_set'] = 'public_test' self.test_args['fit_preprocessor'] = fit_test_preprocessor del self.test_args['start'] del self.test_args['stop'] del self.test_args['self'] files = {'train': 'train.csv', 'public_test': 'test.csv'} try: filename = files[which_set] except KeyError: raise ValueError("Unrecognized dataset name: " + which_set) path = base_path + '/' + filename path = preprocess(path) X, y = self._load_data(path, which_set == 'train') if start is not None: assert which_set != 'test' assert isinstance(start, int) assert isinstance(stop, int) assert start >= 0 assert start < stop assert stop <= X.shape[0] X = X[start:stop, :] if y is not None: y = y[start:stop, :] """ if trainindex: X_list_flipLR, X_list_flipUD = self.flipData(X) X = X + X_list_flipLR y = y + y """ view_converter = DefaultViewConverter(shape=[48, 48, 1], axes=axes) super(EmotionsDataset, self).__init__(X=X, y=y, view_converter=view_converter) if preprocessor: preprocessor.apply(self, can_fit=fit_preprocessor)
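A hedged usage sketch of the start/stop slicing above; `EmotionsDataset` is assumed to be in scope, and the row counts are purely illustrative:

# Hypothetical split of the kaggle training csv into train/validation
# portions via the start/stop arguments above.  The counts are placeholders,
# not the actual dataset size.
train = EmotionsDataset(which_set='train', start=0, stop=25000)
valid = EmotionsDataset(which_set='train', start=25000, stop=28000)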
def show(image): """ .. todo:: WRITEME Parameters ---------- image : PIL Image object or ndarray If ndarray, integer formats are assumed to use 0-255 and float formats are assumed to use 0-1 """ if hasattr(image, '__array__'): #do some shape checking because PIL just raises a tuple indexing error #that doesn't make it very clear what the problem is if len(image.shape) < 2 or len(image.shape) > 3: raise ValueError('image must have either 2 or 3 dimensions but its' ' shape is ' + str(image.shape)) if image.dtype == 'int8': image = np.cast['uint8'](image) elif str(image.dtype).startswith('float'): #don't use *=, we don't want to modify the input array image = image * 255. image = np.cast['uint8'](image) #PIL is too stupid to handle single-channel arrays if len(image.shape) == 3 and image.shape[2] == 1: image = image[:, :, 0] try: ensure_Image() image = Image.fromarray(image) except TypeError: raise TypeError("PIL issued TypeError on ndarray of shape " + str(image.shape) + " and dtype " + str(image.dtype)) # Create a temporary file with the suffix '.png'. fd, name = mkstemp(suffix='.png') os.close(fd) # Note: # Although we can use tempfile.NamedTemporaryFile() to create # a temporary file, the function should be used with care. # # In Python earlier than 2.7, a temporary file created by the # function will be deleted just after the file is closed. # We can re-use the name of the temporary file, but there is an # instant where a file with the name does not exist in the file # system before we re-use the name. This may cause a race # condition. # # In Python 2.7 or later, tempfile.NamedTemporaryFile() has # the 'delete' argument which can control whether a temporary # file will be automatically deleted or not. With the argument, # the above race condition can be avoided. # image.save(name) viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}') if os.name == 'nt': subprocess.Popen(viewer_command + ' ' + name + ' && del ' + name, shell=True) else: subprocess.Popen(viewer_command + ' ' + name + ' ; rm ' + name, shell=True)
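The comment above mentions the Python 2.7 `delete` argument as a way to avoid the temp-file race. A hedged sketch of that alternative; `show_with_named_tempfile` is illustrative and not what the function above actually does:

import subprocess
import tempfile


def show_with_named_tempfile(pil_image, viewer_command):
    # Alternative using the `delete` argument discussed above (Python 2.7+):
    # the file persists after close(), so the viewer can open it by name
    # without a create/then-reuse race.  The caller is responsible for
    # removing f.name once the viewer has exited.
    f = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
    try:
        pil_image.save(f, format='PNG')
    finally:
        f.close()
    subprocess.Popen([viewer_command, f.name])
    return f.name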
def __init__(self, path='train.csv', task='classification', one_hot=False, expect_labels=True, expect_headers=True, delimiter=',', start=None, stop=None, start_fraction=None, end_fraction=None): """ .. todo:: WRITEME """ self.path = path self.task = task self.one_hot = one_hot self.expect_labels = expect_labels self.expect_headers = expect_headers self.delimiter = delimiter self.start = start self.stop = stop self.start_fraction = start_fraction self.end_fraction = end_fraction self.view_converter = None if task not in ['classification', 'regression']: raise ValueError('task must be either "classification" or ' '"regression"; got ' + str(task)) if start_fraction is not None: if end_fraction is not None: raise ValueError("Use start_fraction or end_fraction, " " not both.") if start_fraction <= 0: raise ValueError("start_fraction should be > 0") if start_fraction >= 1: raise ValueError("start_fraction should be < 1") if end_fraction is not None: if end_fraction <= 0: raise ValueError("end_fraction should be > 0") if end_fraction >= 1: raise ValueError("end_fraction should be < 1") if start is not None: if start_fraction is not None or end_fraction is not None: raise ValueError("Use start, start_fraction, or end_fraction," " just not together.") if stop is not None: if start_fraction is not None or end_fraction is not None: raise ValueError("Use stop, start_fraction, or end_fraction," " just not together.") # and go self.path = preprocess(self.path) X, y = self._load_data() super(CSVDataset, self).__init__(X=X, y=y)
or python lcn.py private_test to preprocess the ICML 2013 multimodal
learning contest's private test images (which will be released 72 hours
before the contest ends)
"""

if len(sys.argv) != 2:
    usage()
    print '(You used the wrong number of arguments)'
    quit(-1)

_, arg = sys.argv

if arg == 'public_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
elif arg == 'private_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
else:
    usage()
    print 'Unrecognized argument value:', arg
    print 'Recognized values are: public_test, private_test'
    quit(-1)

serial.mkdir(outdir)
paths = os.listdir(base)
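A worked example of the output-directory naming above: stripping the trailing 'images' (six characters) and appending 'lcn'. The base path is illustrative:

# 'public_test_images' becomes 'public_test_lcn' next to the input directory.
base = '/data/icml_2013_multimodal/public_test_images'
outdir = base[:-6] + 'lcn'
assert outdir == '/data/icml_2013_multimodal/public_test_lcn'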
if len(sys.argv) < 5: print 'Usage: analysis.py <path/to/voter_models> <path/to/valid_images> <path/to/test_images> <batch>' sys.exit(-1) models_path = sys.argv[1] # 'data/food100/output_resized_64/img_61_*.jpg' valid_path = sys.argv[2] test_path = sys.argv[3] batch = int(sys.argv[4]) class_to_id = get_classes() id_to_class = {v: k for k, v in class_to_id.items()} class_to_superclass = get_mapping() label_names_pkl_path = os.path.join(string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'food100', 'label_names.pkl') label_names_pkl = open(label_names_pkl_path, 'rb') label_names = pickle.load(label_names_pkl) class_to_label = {l : i for i, l in enumerate(label_names)} label_names_pkl.close() print 'Loading valid set from: %s' % valid_path valid_set = [img for img in glob(valid_path) if is_included(img, class_to_superclass, id_to_class)] voters = get_voters(models_path) confidence_matrix = np.zeros((len(voters), len(label_names)), dtype=float) for i, voter in enumerate(voters): cm = get_confusion_matrix(len(label_names), id_to_class, class_to_superclass, class_to_label, valid_set, voter, batch) print 'model#%d... misclass rate: %f' % (i, get_misclass(cm)) cm /= np.sum(cm, axis=0) confidence_matrix[i] = cm.diagonal()
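A minimal toy example of the normalization above: each column of a voter's confusion matrix is scaled to sum to one, and the diagonal is then stored as that voter's per-class confidence (the numbers are made up):

import numpy as np

# Toy 3-class confusion matrix in the orientation produced by
# get_confusion_matrix above; values are for illustration only.
cm = np.array([[8., 1., 0.],
               [2., 7., 1.],
               [0., 2., 9.]])

# Normalize each column to sum to one, as in the snippet above, then read
# this voter's per-class confidence off the diagonal.
cm = cm / np.sum(cm, axis=0)
confidence = cm.diagonal()   # array([0.8, 0.7, 0.9])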
def show(image):
    """
    .. todo::

        WRITEME

    Parameters
    ----------
    image : PIL Image object or ndarray
        If ndarray, integer formats are assumed to use 0-255
        and float formats are assumed to use 0-1
    """
    viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}')

    if viewer_command == 'inline':
        return imview(image)

    if hasattr(image, '__array__'):
        # do some shape checking because PIL just raises a tuple indexing
        # error that doesn't make it very clear what the problem is
        if len(image.shape) < 2 or len(image.shape) > 3:
            raise ValueError('image must have either 2 or 3 dimensions but '
                             'its shape is ' + str(image.shape))

        # The below is a temporary workaround that prevents us from crashing
        # 3rd party image viewers such as eog by writing out overly large
        # images.
        # In the long run we should determine if this is a bug in PIL when
        # producing such images or a bug in eog and determine a proper fix.
        # Since this is hopefully just a short term workaround the constants
        # below are not included in the interface to the function, so that
        # 3rd party code won't start passing them.
        max_height = 4096
        max_width = 4096

        # Display separate warnings for each direction, since it's
        # common to crop only one.
        if image.shape[0] > max_height:
            image = image[0:max_height, :, :]
            warnings.warn("Cropping image to smaller height to avoid "
                          "crashing the viewer program.")
        if image.shape[1] > max_width:
            image = image[:, 0:max_width, :]
            warnings.warn("Cropping the image to a smaller width to avoid "
                          "crashing the viewer program.")
        # This ends the workaround

        if image.dtype == 'int8':
            image = np.cast['uint8'](image)
        elif str(image.dtype).startswith('float'):
            # don't use *=, we don't want to modify the input array
            image = image * 255.
            image = np.cast['uint8'](image)

        # PIL is too stupid to handle single-channel arrays
        if len(image.shape) == 3 and image.shape[2] == 1:
            image = image[:, :, 0]

        try:
            ensure_Image()
            image = Image.fromarray(image)
        except TypeError:
            reraise_as(TypeError("PIL issued TypeError on ndarray of shape " +
                                 str(image.shape) + " and dtype " +
                                 str(image.dtype)))

    # Create a temporary file with the suffix '.png'.
    fd, name = mkstemp(suffix='.png')
    os.close(fd)

    # Note:
    # Although we can use tempfile.NamedTemporaryFile() to create
    # a temporary file, the function should be used with care.
    #
    # In Python earlier than 2.7, a temporary file created by the
    # function will be deleted just after the file is closed.
    # We can re-use the name of the temporary file, but there is an
    # instant where a file with the name does not exist in the file
    # system before we re-use the name. This may cause a race
    # condition.
    #
    # In Python 2.7 or later, tempfile.NamedTemporaryFile() has
    # the 'delete' argument which can control whether a temporary
    # file will be automatically deleted or not. With the argument,
    # the above race condition can be avoided.
    #
    image.save(name)
    if os.name == 'nt':
        subprocess.Popen(viewer_command + ' ' + name + ' && del ' + name,
                         shell=True)
    else:
        subprocess.Popen(viewer_command + ' ' + name + ' ; rm ' + name,
                         shell=True)
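A hedged usage sketch of the 'inline' branch above, assuming this function is the one exposed as pylearn2.utils.image.show and that the imview package is installed; the random image is just a placeholder:

import os
import numpy as np
from pylearn2.utils.image import show

# With PYLEARN2_VIEWER_COMMAND set to 'inline', show() returns
# imview(image) instead of spawning an external viewer, which is handy in
# notebooks or headless sessions.
os.environ['PYLEARN2_VIEWER_COMMAND'] = 'inline'
show(np.random.uniform(size=(48, 48)))  # float images are assumed in [0, 1]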
def main(): data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10') print('Loading STL-10 unlabeled and train datasets...') downsampled_dir = data_dir + '/stl10_32x32' data = serial.load(downsampled_dir + '/unlabeled.pkl') supplement = serial.load(downsampled_dir + '/train.pkl') print('Concatenating datasets...') data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0)) print("Preparing output directory...") output_dir = data_dir + '/stl10_32x32_whitened' serial.mkdir(output_dir) README = open(output_dir + '/README', 'w') README.write( textwrap.dedent(""" The .pkl files in this directory may be opened in python using cPickle, pickle, or pylearn2.serial.load. unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each contain a pylearn2 Dataset object defining an unlabeled dataset of a 32x32 approximately whitened version of the STL-10 dataset. unlabeled.pkl contains unlabeled train examples. train.pkl contains labeled train examples. unsupervised.pkl contains the union of these (without any labels). test.pkl contains the labeled test examples. preprocessor.pkl contains a pylearn2 ZCA object that was used to approximately whiten the images. You may want to use this object later to preprocess other images. They were created with the pylearn2 script make_stl10_whitened.py. All other files in this directory, including this README, were created by the same script and are necessary for the other files to function correctly. """)) README.close() print("Learning the preprocessor \ and preprocessing the unsupervised train data...") preprocessor = preprocessing.ZCA() data.apply_preprocessor(preprocessor=preprocessor, can_fit=True) print('Saving the unsupervised data') data.use_design_loc(output_dir + '/unsupervised.npy') serial.save(output_dir + '/unsupervised.pkl', data) X = data.X unlabeled = X[0:100 * 1000, :] labeled = X[100 * 1000:, :] del X print("Saving the unlabeled data") data.X = unlabeled data.use_design_loc(output_dir + '/unlabeled.npy') serial.save(output_dir + '/unlabeled.pkl', data) del data del unlabeled print("Saving the labeled train data") supplement.X = labeled supplement.use_design_loc(output_dir + '/train.npy') serial.save(output_dir + '/train.pkl', supplement) del supplement del labeled print("Loading the test data") test = serial.load(downsampled_dir + '/test.pkl') print("Preprocessing the test data") test.apply_preprocessor(preprocessor=preprocessor, can_fit=False) print("Saving the test data") test.use_design_loc(output_dir + '/test.npy') serial.save(output_dir + '/test.pkl', test) serial.save(output_dir + '/preprocessor.pkl', preprocessor)
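The README above notes that preprocessor.pkl can be reused to whiten other images later. A hedged sketch of that reuse, mirroring the script's own calls (the paths are the ones this script creates):

from pylearn2.utils import serial

# Reload the fitted ZCA preprocessor and apply it to another dataset
# without refitting (can_fit=False); serial.load expands the ${...}
# environment variable via preprocess.
preprocessor = serial.load(
    '${PYLEARN2_DATA_PATH}/stl10/stl10_32x32_whitened/preprocessor.pkl')
other = serial.load('${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl')
other.apply_preprocessor(preprocessor=preprocessor, can_fit=False)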
def load(filepath, recurse_depth=0, retry=True): """ Parameters ---------- filepath : str A path to a file to load. Should be a pickle, Matlab, or NumPy file; or a .txt or .amat file that numpy.loadtxt can load. recurse_depth : int End users should not use this argument. It is used by the function itself to implement the `retry` option recursively. retry : bool If True, will make a handful of attempts to load the file before giving up. This can be useful if you are for example calling show_weights.py on a file that is actively being written to by a training script--sometimes the load attempt might fail if the training script writes at the same time show_weights tries to read, but if you try again after a few seconds you should be able to open the file. Returns ------- loaded_object : object The object that was stored in the file. ..todo Refactor to hide recurse_depth from end users """ try: import joblib joblib_available = True except ImportError: joblib_available = False if recurse_depth == 0: filepath = preprocess(filepath) if filepath.endswith('.npy') or filepath.endswith('.npz'): return np.load(filepath) if filepath.endswith('.amat') or filepath.endswith('txt'): try: return np.loadtxt(filepath) except Exception: logger.exception("{0} cannot be loaded by serial.load (trying to" " use np.loadtxt)".format(filepath)) raise if filepath.endswith('.mat'): global io if io is None: import scipy.io io = scipy.io try: return io.loadmat(filepath) except NotImplementedError, nei: if str(nei).find('HDF reader') != -1: global hdf_reader if hdf_reader is None: import h5py hdf_reader = h5py return hdf_reader.File(filepath) else: raise #this code should never be reached assert False
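A short usage sketch of serial.load across the file types dispatched on above (file names are placeholders):

from pylearn2.utils import serial

# Hypothetical file names; serial.load dispatches on the extension as in
# the checks above.
arr = serial.load('weights.npy')         # handled by np.load
mat = serial.load('features.mat')        # scipy.io.loadmat, or h5py for HDF5 .mat files
txt = serial.load('design_matrix.amat')  # handled by np.loadtxt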