def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=True, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y` and `X` attributes are reused by this object.
    preprocessor : object
        Preprocessor exposing `P_`; its `inv_P_` is computed here when it
        is missing or None.
    convert_to_one_hot : bool, optional
        If True, `y` is replaced by a float32 one-hot matrix with
        `y.max() + 1` columns and the 'targets' component of the data
        specs is resized accordingly.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : list, optional
        Forwarded to `view_converter.set_axes`.

    Raises
    ------
    AssertionError
        If one-hot conversion is requested and `y.min()` is not 0, or if
        the sliced shapes are inconsistent.
    """
    # Must run first so that only the call arguments are captured.
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    if convert_to_one_hot:
        if not (self.y.min() == 0):
            raise AssertionError("Expected y.min == 0 but y.min == %g" %
                                 self.y.min())
        nclass = self.y.max() + 1
        y = np.zeros((self.y.shape[0], nclass), dtype='float32')
        for i in xrange(self.y.shape[0]):
            y[i, self.y[i]] = 1.
        self.y = y
        assert self.y is not None
        # NOTE(review): `space` is the object held by
        # `preprocessed_dataset.data_specs`, so the wrapped dataset sees
        # this change as well -- confirm that is intended.
        space, source = self.data_specs
        space.components[source.index('targets')].dim = nclass

    if control.get_load_data():
        if start is not None:
            self.X = preprocessed_dataset.X[start:stop, :]
            if self.y is not None:
                self.y = self.y[start:stop, :]
            # `stop - start` is only defined when `stop` was given.
            if stop is not None:
                assert self.X.shape[0] == stop - start
        else:
            self.X = preprocessed_dataset.X
    else:
        self.X = None

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    if getattr(preprocessor, "inv_P_", None) is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')

    self.view_converter.set_axes(axes)
def __setstate__(self, d):
    """
    Restore the object's attributes from the pickled state `d`.

    Parameters
    ----------
    d : dict
        Pickled state. Must contain 'design_loc' and 'compress'. When
        'design_loc' is not None, `X` is reloaded from that path (or set
        to None when data loading is disabled). When 'compress' is true,
        'compress_max' and 'compress_min' are consumed and `X` is
        rescaled as ``X * max / 255 + min`` in float32.

    Raises
    ------
    NotImplementedError
        If the data specs have to be rebuilt and the stored view
        converter has no `topo_space` attribute.
    """
    if d['design_loc'] is not None:
        if control.get_load_data():
            d['X'] = np.load(d['design_loc'])
        else:
            d['X'] = None

    if d['compress']:
        X = d['X']
        mx = d['compress_max']
        mn = d['compress_min']
        del d['compress_max']
        del d['compress_min']
        d['X'] = 0
        self.__dict__.update(d)
        if X is not None:
            # np.cast was removed in NumPy 2.0; asarray with an explicit
            # dtype performs the same float32 conversion.
            self.X = np.asarray(X, dtype='float32') * mx / 255. + mn
        else:
            self.X = None
    else:
        self.__dict__.update(d)

    # To be able to unpickle older data after the addition of
    # the data_specs mechanism
    required = ('data_specs', 'X_space', '_iter_data_specs', 'X_topo_space')
    if all(m in d for m in required):
        return

    # NOTE(review): this reads self.X.shape, so it fails when X was left
    # as None because data loading is disabled.
    X_space = VectorSpace(dim=self.X.shape[1])
    X_source = 'features'
    if self.y is None:
        space = X_space
        source = X_source
    else:
        y_space = VectorSpace(dim=self.y.shape[-1])
        y_source = 'targets'
        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)

    self.data_specs = (space, source)
    self.X_space = X_space
    self._iter_data_specs = (X_space, X_source)

    view_converter = d.get('view_converter', None)
    if view_converter is not None:
        # Get the topo_space from the view_converter
        if not hasattr(view_converter, 'topo_space'):
            raise NotImplementedError("Not able to get a topo_space "
                                      "from this converter: %s"
                                      % view_converter)

        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = view_converter.topo_space
def from_dataset(dataset, num_examples):
    """
    Build a `DenseDesignMatrix` from one topological batch of `dataset`.

    Parameters
    ----------
    dataset : object
        Source dataset; must provide `get_batch_topo` and
        `adjust_for_viewer`.
    num_examples : int
        Number of examples requested from `dataset.get_batch_topo`.

    Returns
    -------
    rval : DenseDesignMatrix
        A design matrix built from the batch, or an empty one sharing the
        source's view converter when no data is loaded.
    """
    try:
        V, y = dataset.get_batch_topo(num_examples, True)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are never swallowed by the fallback below.
        #
        # This patches a case where control.get_load_data() is false so
        # dataset.X is None. This logic should be removed whenever we
        # implement lazy loading.
        if (isinstance(dataset, DenseDesignMatrix) and
                dataset.X is None and
                not control.get_load_data()):
            warnings.warn("from_dataset wasn't able to make subset of "
                          "dataset, using the whole thing")
            return DenseDesignMatrix(X=None,
                                     view_converter=dataset.view_converter)
        raise

    rval = DenseDesignMatrix(topo_view=V, y=y)
    rval.adjust_for_viewer = dataset.adjust_for_viewer
    return rval
def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=True, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y` and `X` attributes are reused by this object.
    preprocessor : object
        Preprocessor exposing `P_`; its `inv_P_` is computed here when it
        is missing or None.
    convert_to_one_hot : bool, optional
        If True, `y` is replaced by a float32 one-hot matrix with
        `y.max() + 1` columns and the 'targets' component of the data
        specs is resized accordingly.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : list, optional
        Forwarded to `view_converter.set_axes`.

    Raises
    ------
    AssertionError
        If one-hot conversion is requested and `y.min()` is not 0, or if
        the sliced shapes are inconsistent.
    """
    # Must run first so that only the call arguments are captured.
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    if convert_to_one_hot:
        label_min = self.y.min()
        if not (label_min == 0):
            raise AssertionError("Expected y.min == 0 but y.min == %g" %
                                 label_min)
        nclass = self.y.max() + 1
        one_hot = np.zeros((self.y.shape[0], nclass), dtype='float32')
        for i in xrange(self.y.shape[0]):
            one_hot[i, self.y[i]] = 1.
        self.y = one_hot
        assert self.y is not None
        # NOTE(review): `space` is the object held by
        # `preprocessed_dataset.data_specs`, so the wrapped dataset sees
        # this change as well -- confirm that is intended.
        space, source = self.data_specs
        space.components[source.index('targets')].dim = nclass

    if not control.get_load_data():
        self.X = None
    elif start is None:
        self.X = preprocessed_dataset.X
    else:
        self.X = preprocessed_dataset.X[start:stop, :]
        if self.y is not None:
            self.y = self.y[start:stop, :]
        # `stop - start` is only defined when `stop` was given.
        if stop is not None:
            assert self.X.shape[0] == stop - start

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    # getattr: a preprocessor without an `inv_P_` attribute at all is
    # treated the same as one whose `inv_P_` is None.
    if getattr(preprocessor, "inv_P_", None) is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')

    self.view_converter.set_axes(axes)
def from_dataset(dataset, num_examples):
    """
    Build a `DenseDesignMatrix` from one topological batch of `dataset`.

    Parameters
    ----------
    dataset : object
        Source dataset; must provide `get_batch_topo` and
        `adjust_for_viewer`.
    num_examples : int
        Number of examples requested from `dataset.get_batch_topo`.

    Returns
    -------
    rval : DenseDesignMatrix
        A design matrix built from the batch, or an empty one sharing the
        source's view converter when no data is loaded.
    """
    try:
        V, y = dataset.get_batch_topo(num_examples, True)
    except Exception:
        # Catching `Exception` (not everything) keeps KeyboardInterrupt
        # and SystemExit from being absorbed by the fallback.
        #
        # This patches a case where control.get_load_data() is false so
        # dataset.X is None. This logic should be removed whenever we
        # implement lazy loading.
        data_unavailable = (isinstance(dataset, DenseDesignMatrix) and
                            dataset.X is None and
                            not control.get_load_data())
        if not data_unavailable:
            raise
        warnings.warn("from_dataset wasn't able to make subset of "
                      "dataset, using the whole thing")
        return DenseDesignMatrix(X=None,
                                 view_converter=dataset.view_converter)

    rval = DenseDesignMatrix(topo_view=V, y=y)
    rval.adjust_for_viewer = dataset.adjust_for_viewer
    return rval
def __init__(self, preprocessed_dataset, preprocessor, start=None,
             stop=None, axes=None):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y`, `y_labels` and `X` attributes are reused.
    preprocessor : object
        Preprocessor exposing `P_`; its `inv_P_` is computed here when it
        is missing or None.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : object, optional
        Deprecated and ignored; passing anything but None only emits a
        warning.

    Raises
    ------
    ValueError
        If a row range is requested and `y` is not 2-dimensional.
    """
    if axes is not None:
        warnings.warn("The axes argument to ZCA_Dataset no longer has "
                      "any effect. Its role is now carried out by the "
                      "Space you pass to Dataset.iterator. You should "
                      "remove 'axes' arguments from calls to "
                      "ZCA_Dataset. This argument may be removed from "
                      "the library after 2015-05-05.")
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    self.y_labels = preprocessed_dataset.y_labels

    # Defined up here because PEP8 requires excessive indenting if defined
    # where it is used.
    msg = ("Expected self.y to have dim 2, but it has %d. Maybe you are "
           "loading from an outdated pickle file?")
    if control.get_load_data():
        if start is not None:
            self.X = preprocessed_dataset.X[start:stop, :]
            if self.y is not None:
                if self.y.ndim != 2:
                    raise ValueError(msg % self.y.ndim)
                self.y = self.y[start:stop, :]
            # `stop - start` is only defined when `stop` was given.
            if stop is not None:
                assert self.X.shape[0] == stop - start
        else:
            self.X = preprocessed_dataset.X
    else:
        self.X = None

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    if getattr(preprocessor, "inv_P_", None) is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')
def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=True, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y` and `X` attributes are reused by this object.
    preprocessor : object
        Preprocessor; its `invert()` method is called unconditionally.
    convert_to_one_hot : bool, optional
        If True, `y` is replaced by a float32 one-hot matrix with
        `y.max() + 1` columns and the 'targets' component of the data
        specs is resized accordingly.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : list, optional
        Stored as `view_converter.axes` (the list object itself, not a
        copy).

    Raises
    ------
    AssertionError
        If one-hot conversion is requested and `y.min()` is not 0, or if
        the sliced shapes are inconsistent.
    """
    # Must run first so that only the call arguments are captured.
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    if convert_to_one_hot:
        if not (self.y.min() == 0):
            raise AssertionError("Expected y.min == 0 but y.min == " +
                                 str(self.y.min()))
        nclass = self.y.max() + 1
        y = np.zeros((self.y.shape[0], nclass), dtype='float32')
        for i in xrange(self.y.shape[0]):
            y[i, self.y[i]] = 1.
        self.y = y
        assert self.y is not None
        # NOTE(review): `space` is the object held by
        # `preprocessed_dataset.data_specs`, so the wrapped dataset sees
        # this change as well -- confirm that is intended.
        space, source = self.data_specs
        space.components[source.index('targets')].dim = nclass

    if control.get_load_data():
        if start is not None:
            self.X = preprocessed_dataset.X[start:stop, :]
            if self.y is not None:
                self.y = self.y[start:stop, :]
            # `stop - start` is only defined when `stop` was given.
            if stop is not None:
                assert self.X.shape[0] == stop - start
        else:
            self.X = preprocessed_dataset.X
    else:
        self.X = None

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    # Parenthesised single-argument print behaves identically under
    # Python 2 and Python 3.
    print('inverting...')
    preprocessor.invert()
    print('...done inverting')

    self.view_converter.axes = axes
def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=True, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y` and `X` attributes are reused by this object.
    preprocessor : object
        Preprocessor; its `invert()` method is called unconditionally.
    convert_to_one_hot : bool, optional
        If True, `y` is replaced by a float32 one-hot matrix with
        `y.max() + 1` columns and the 'targets' component of the data
        specs is resized accordingly.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : list, optional
        Stored as `view_converter.axes` (the list object itself, not a
        copy).

    Raises
    ------
    AssertionError
        If one-hot conversion is requested and `y.min()` is not 0, or if
        the sliced shapes are inconsistent.
    """
    # Must run first so that only the call arguments are captured.
    self.args = locals()

    source_ds = preprocessed_dataset
    self.preprocessed_dataset = source_ds
    self.preprocessor = preprocessor
    self.rng = source_ds.rng
    self.data_specs = source_ds.data_specs
    self.X_space = source_ds.X_space
    self.X_topo_space = source_ds.X_topo_space
    self.view_converter = source_ds.view_converter

    self.y = source_ds.y
    if convert_to_one_hot:
        if not (self.y.min() == 0):
            raise AssertionError("Expected y.min == 0 but y.min == " +
                                 str(self.y.min()))
        nclass = self.y.max() + 1
        one_hot = np.zeros((self.y.shape[0], nclass), dtype='float32')
        for i in xrange(self.y.shape[0]):
            one_hot[i, self.y[i]] = 1.
        self.y = one_hot
        assert self.y is not None
        # NOTE(review): `space` is the object held by
        # `preprocessed_dataset.data_specs`, so the wrapped dataset sees
        # this change as well -- confirm that is intended.
        space, source = self.data_specs
        space.components[source.index('targets')].dim = nclass

    if not control.get_load_data():
        self.X = None
    elif start is None:
        self.X = source_ds.X
    else:
        self.X = source_ds.X[start:stop, :]
        if self.y is not None:
            self.y = self.y[start:stop, :]
        # `stop - start` is only defined when `stop` was given.
        if stop is not None:
            assert self.X.shape[0] == stop - start

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    # Parenthesised single-argument print behaves identically under
    # Python 2 and Python 3.
    print('inverting...')
    preprocessor.invert()
    print('...done inverting')

    self.view_converter.axes = axes
def __init__(self, preprocessed_dataset, preprocessor, start=None,
             stop=None, axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y`, `y_labels` and `X` attributes are reused.
    preprocessor : object
        Preprocessor exposing `P_`; its `inv_P_` is computed here when it
        is missing or None.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : list, optional
        Forwarded to `view_converter.set_axes`.
    """
    # Must run first so that only the call arguments are captured.
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    self.y_labels = preprocessed_dataset.y_labels

    if control.get_load_data():
        if start is not None:
            self.X = preprocessed_dataset.X[start:stop, :]
            if self.y is not None:
                self.y = self.y[start:stop, :]
            # `stop - start` is only defined when `stop` was given.
            if stop is not None:
                assert self.X.shape[0] == stop - start
        else:
            self.X = preprocessed_dataset.X
    else:
        self.X = None

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    if getattr(preprocessor, "inv_P_", None) is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')

    self.view_converter.set_axes(axes)
def __init__(self, which_set='train', center=False, start=None, stop=None,
             axes=['b', 'c', 0, 1], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    """
    Load (or fabricate) one split of the ECMWF dataset.

    Parameters
    ----------
    which_set : {'train', 'valid', 'test'}
        Split to load.
    center : bool, optional
        If True, subtract the per-position mean over examples from the
        topological view.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".
    axes : list, optional
        Forwarded to the parent constructor.
    preprocessor : object, optional
        If truthy, `preprocessor.apply(self, fit_preprocessor)` is called.
    fit_preprocessor, fit_test_preprocessor : bool, optional
        For the test split these must agree unless
        `fit_test_preprocessor` is None.

    Raises
    ------
    ValueError
        On an unknown `which_set` or an out-of-range `stop`.
    """
    self.shape = (8, 35, 57)
    self.size = {'train': 2849, 'valid': 2849, 'test': 2849}
    self.range = (-10, 10)
    self.path = "${PYLEARN2_DATA_PATH}/ecmwf/"
    self.set_path = {'train': 'ecmwf.train',
                     'valid': 'ecmwf.val',
                     'test': 'ecmwf.test'}
    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","valid","test"].' % (which_set,))

    path = self.path + self.set_path[which_set]
    if control.get_load_data():
        path = serial.preprocess(path)
        datasetCache = cache.datasetCache
        path = datasetCache.cache_file(path)
        X, topo_view, y = self._read_ecmwf(path, which_set)
    else:
        # Placeholder data. The topological view has to be 4-D (examples
        # plus `self.shape`) because its shape is unpacked into four
        # values below; the flat 1-D array used before always failed.
        num_examples = self.size[which_set]
        X = np.random.rand(num_examples, np.prod(self.shape))
        topo_view = np.random.rand(num_examples, *self.shape)
        y = np.random.randint(self.range[0], self.range[1],
                              (num_examples, 1))

    # Unpacking doubles as a check that the view is 4-dimensional.
    (m, v, r, c) = topo_view.shape

    if center:
        topo_view -= topo_view.mean(axis=0)

    super(ECMWF, self).__init__(X=X, topo_view=topo_view, y=y, axes=axes)

    assert not np.any(np.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop is None:
            # An omitted `stop` selects everything from `start` onwards.
            stop = self.X.shape[0]
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, one_hot=False, binarize=False,
             axes=['b', 0, 1, 'c'], preprocessor=ZCA(),
             fit_preprocessor=False, fit_test_preprocessor=False):
    """
    Load one split of the faceEmo dataset from ``$PYLEARN2_DATA_PATH``.

    Parameters
    ----------
    which_set : {'train', 'test'}
        Split to load.
    center : bool, optional
        If True, subtract the per-feature mean from `X`.
    one_hot : bool, optional
        If True, labels are expanded to float32 one-hot rows with 3
        columns.
    binarize : bool, optional
        If True, `X` is thresholded at 0.5.
    axes : list, optional
        Accepted for signature compatibility; not used by this method.
    preprocessor : object, optional
        If truthy, it is fitted on `X` and then applied to the dataset.
    fit_preprocessor, fit_test_preprocessor : bool, optional
        For the test split these must agree unless
        `fit_test_preprocessor` is None.
    """
    # NOTE(review): the default `ZCA()` is created once when the function
    # is defined, and `fit` below mutates it, so every instance built with
    # the default shares one preprocessor. The default is left unchanged
    # here to keep the signature stable.
    self.args = locals()

    if control.get_load_data():
        path = os.environ['PYLEARN2_DATA_PATH'] + '/faceEmo/'
        if which_set == 'train':
            X = np.load(path + 'train_X.npy').astype('float32')
            y = np.load(path + 'train_y.npy').astype('float32')
        else:
            assert which_set == 'test'
            X = np.load(path + 'test_X.npy').astype('float32')
            y = np.load(path + 'test_y.npy').astype('float32')
    # NOTE(review): X and y are only bound when data loading is enabled;
    # with loading disabled the statements below fail -- confirm whether a
    # placeholder is wanted for that case.

    if binarize:
        X = (X > 0.5).astype('float32')

    self.one_hot = one_hot
    if one_hot:
        # The labels were loaded as float32; they must be integers to be
        # usable as column indices.
        label_idx = y.astype('int64')
        one_hot = np.zeros((y.shape[0], 3), dtype='float32')
        for i in xrange(y.shape[0]):
            one_hot[i, label_idx[i]] = 1.
        y = one_hot

    if center:
        X -= X.mean(axis=0)

    super(FaceEmo, self).__init__(X=X, y=y)

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.fit(self.X)
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, preprocessed_dataset, preprocessor, start=None,
             stop=None, axes=None):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y`, `y_labels` and `X` attributes are reused.
    preprocessor : object
        Stored as `self.preprocessor`.
    start, stop : int, optional
        Row range kept from `X` and `y`. The range is only applied when
        `start` is given; `stop=None` then means "through the last row".
    axes : object, optional
        Deprecated and ignored; passing anything but None only emits a
        warning.

    Raises
    ------
    ValueError
        If a row range is requested and `y` is not 2-dimensional.
    """
    if axes is not None:
        warnings.warn("The axes argument to ZCA_Dataset no longer has "
                      "any effect. Its role is now carried out by the "
                      "Space you pass to Dataset.iterator. You should "
                      "remove 'axes' arguments from calls to "
                      "ZCA_Dataset. This argument may be removed from "
                      "the library after 2015-05-05.")
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor
    self.rng = self.preprocessed_dataset.rng
    self.data_specs = preprocessed_dataset.data_specs
    self.X_space = preprocessed_dataset.X_space
    self.X_topo_space = preprocessed_dataset.X_topo_space
    self.view_converter = preprocessed_dataset.view_converter

    self.y = preprocessed_dataset.y
    self.y_labels = preprocessed_dataset.y_labels

    # Defined up here because PEP8 requires excessive indenting if defined
    # where it is used.
    msg = ("Expected self.y to have dim 2, but it has %d. Maybe you are "
           "loading from an outdated pickle file?")

    if not control.get_load_data():
        self.X = None
    elif start is None:
        self.X = preprocessed_dataset.X
    else:
        self.X = preprocessed_dataset.X[start:stop, :]
        if self.y is not None:
            if self.y.ndim != 2:
                raise ValueError(msg % self.y.ndim)
            self.y = self.y[start:stop, :]
        # `stop - start` is only defined when `stop` was given.
        if stop is not None:
            assert self.X.shape[0] == stop - start

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]
def from_dataset(dataset, num_examples):
    """
    Build a `DenseDesignMatrix` from one topological batch of `dataset`.

    This function does not support the tags attribute.

    Parameters
    ----------
    dataset : object
        Source dataset; must provide `get_batch_topo` and
        `adjust_for_viewer`.
    num_examples : int
        Number of examples requested from `dataset.get_batch_topo`.

    Returns
    -------
    rval : DenseDesignMatrix
        A design matrix built from the batch, or an empty one sharing the
        source's view converter when no data is loaded.
    """
    try:
        V, y = dataset.get_batch_topo(num_examples, True)
    except Exception:
        # Catching `Exception` (not everything) keeps KeyboardInterrupt
        # and SystemExit from being absorbed by the fallback.
        #
        # This patches a case where control.get_load_data() is false so
        # dataset.X is None. This logic should be removed whenever we
        # implement lazy loading.
        if (isinstance(dataset, DenseDesignMatrix) and
                dataset.X is None and
                not control.get_load_data()):
            warnings.warn("from_dataset wasn't able to make subset of "
                          "dataset, using the whole thing")
            return DenseDesignMatrix(X=None,
                                     view_converter=dataset.view_converter)
        raise

    rval = DenseDesignMatrix(topo_view=V, y=y)
    rval.adjust_for_viewer = dataset.adjust_for_viewer
    return rval
def _load_path(self, which_set, which_targets, word2vec_dict={}):
    """
    Resolve the path of the TREC question-type label file for a split.

    Parameters
    ----------
    which_set : {'train', 'test'}
        Split whose label file is wanted.
    which_targets : {'fine', 'coarse'}
        Target granularity; only validated here.
    word2vec_dict : dict, optional
        Unused by this method (never read or modified).

    Returns
    -------
    data_path : str or None
        The preprocessed file path, or None when data loading is
        disabled. `self.path` is set only when a path is returned.

    Raises
    ------
    ValueError
        If `which_set` or `which_targets` has an unsupported value.
    """
    if which_targets not in ['fine', 'coarse']:
        # Report the argument that is actually wrong.
        raise ValueError(
            'Unrecognized which_targets value "%s". Valid values are '
            '["fine","coarse"].' % (which_targets,))

    if which_set not in ['train', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,))

    if not control.get_load_data():
        # No path is computed without data loading; return explicitly
        # instead of touching names that were never bound.
        return None

    path = "${PYLEARN2_DATA_PATH}/TREC_question_type_data/"
    if which_set == 'train':
        data_path = path + 'trecqc.train_5500.label.txt'
    else:
        assert which_set == 'test'
        data_path = path + 'trecqc.test_500.label.txt'
    data_path = serial.preprocess(data_path)

    self.path = path
    return data_path
def __setstate__(self, d):
    """
    Restore the object's attributes from the pickled state `d`.

    Parameters
    ----------
    d : dict
        Pickled state. Must contain "design_loc" and "compress". When
        "design_loc" is not None, `X` is reloaded from that path (or set
        to None when data loading is disabled). When "compress" is true,
        "compress_max" and "compress_min" are consumed and `X` is
        rescaled as ``X * max / 255 + min`` in float32.
    """
    if d["design_loc"] is not None:
        if control.get_load_data():
            d["X"] = N.load(d["design_loc"])
        else:
            d["X"] = None

    if not d["compress"]:
        self.__dict__.update(d)
        return

    X = d["X"]
    # The scaling bounds are transient; pop them so they are not stored
    # as attributes.
    mx = d.pop("compress_max")
    mn = d.pop("compress_min")
    d["X"] = 0
    self.__dict__.update(d)
    if X is not None:
        # N.cast was removed in NumPy 2.0; asarray with an explicit dtype
        # performs the same float32 conversion.
        self.X = N.asarray(X, dtype="float32") * mx / 255.0 + mn
    else:
        self.X = None
def __setstate__(self, d):
    """
    Restore the object's attributes from the pickled state `d`.

    Parameters
    ----------
    d : dict
        Pickled state. Must contain 'design_loc' and 'compress'. When
        'design_loc' is not None, `X` is reloaded from that path (or set
        to None when data loading is disabled). When 'compress' is true,
        'compress_max' and 'compress_min' are consumed and `X` is
        rescaled as ``X * max / 255 + min`` in float32.
    """
    if d['design_loc'] is not None:
        if control.get_load_data():
            d['X'] = N.load(d['design_loc'])
        else:
            d['X'] = None

    if d['compress']:
        X = d['X']
        mx = d['compress_max']
        mn = d['compress_min']
        del d['compress_max']
        del d['compress_min']
        d['X'] = 0
        self.__dict__.update(d)
        if X is not None:
            # N.cast was removed in NumPy 2.0; asarray with an explicit
            # dtype performs the same float32 conversion.
            self.X = N.asarray(X, dtype='float32') * mx / 255. + mn
        else:
            self.X = None
    else:
        self.__dict__.update(d)
def __init__(self, which_set, start=None, stop=None):
    """
    Load (or fabricate) one split of the plankton image dataset.

    Parameters
    ----------
    which_set : {'train', 'test'}
        Split to load.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".

    Raises
    ------
    ValueError
        On an unknown `which_set` or an out-of-range `stop`.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,))

    size = TEST_SIZE if which_set == 'test' else TRAIN_SIZE
    if control.get_load_data():
        topo_view, y = read_images(IM_GLOB, size, TEST_TRAIN_RANDOM_SEED,
                                   which_set != 'train')
    else:
        # Placeholder data used when loading is disabled.
        # NOTE(review): this view is 3-D while four axes are passed to the
        # parent constructor -- confirm the parent accepts that.
        topo_view = np.random.rand(size, 32, 32)
        y = np.random.randint(0, 10, (size, 1))

    super(PlanktonDDM, self).__init__(topo_view=topo_view, y=y,
                                      axes=['b', 0, 1, 'c'],
                                      y_labels=NUM_CLASSES)

    if start is not None:
        assert start >= 0
        if stop is None:
            # An omitted `stop` selects everything from `start` onwards.
            stop = self.X.shape[0]
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start
def __init__(
    self,
    which_set,
    center=False,
    shuffle=False,
    one_hot=False,
    binarize=False,
    start=None,
    stop=None,
    axes=["b", 0, 1, "c"],
    preprocessor=None,
    fit_preprocessor=False,
    fit_test_preprocessor=False,
):
    """
    Load one split of the VidTIMIT image dataset.

    Parameters
    ----------
    which_set : {"train", "test"}
        Split to load.
    center : bool, optional
        If True, subtract the per-pixel mean over examples.
    shuffle : bool, optional
        If True, examples are shuffled with a fixed-seed generator.
    one_hot : bool, optional
        If True, labels are expanded to float32 one-hot rows with 36
        columns.
    binarize : bool, optional
        If True, pixels are thresholded at 0.5.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".
    axes : list, optional
        Axis order of the topological view handed to the parent class.
    preprocessor : object, optional
        If truthy and data was loaded, it is applied to the dataset.
    fit_preprocessor, fit_test_preprocessor : bool, optional
        For the test split these must agree unless
        `fit_test_preprocessor` is None.

    Raises
    ------
    ValueError
        On an unsupported `which_set` or an out-of-range `stop`.
    """
    self.args = locals()

    if which_set not in ["train", "test"]:
        if which_set == "valid":
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature."
            )
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,)
        )

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the `axes` order."""
        default = ("b", 0, 1, "c")
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${VIDTIMIT}/data/"
        if which_set == "train":
            im_path = path + "train.npy"
            label_path = path + "train-labels.npy"
        else:
            assert which_set == "test"
            im_path = path + "test.npy"
            label_path = path + "test-labels.npy"

        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)
        topo_view = np.load(im_path)
        y = np.load(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype("float32")

        self.one_hot = one_hot
        if one_hot:
            one_hot = N.zeros((y.shape[0], 36), dtype="float32")
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.0
            y = one_hot
            max_labels = None
        else:
            max_labels = 36

        m, r, c = topo_view.shape
        assert r == 32
        assert c == 32
        topo_view = topo_view.reshape(m, r, c, 1)

        # `which_set` was validated above, so only these two cases exist.
        assert m == (27280 if which_set == "train" else 10929)

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                           which_method="shuffle")
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for one_hot=True/False
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(VidTIMIT, self).__init__(topo_view=dimshuffle(topo_view),
                                       y=y, axes=axes,
                                       max_labels=max_labels)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop is None:
                # An omitted `stop` selects everything from `start` on.
                stop = self.X.shape[0]
            if stop > self.X.shape[0]:
                raise ValueError("stop=" + str(stop) + ">" +
                                 "m=" + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 32, 32, 1)))
        super(VidTIMIT, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == "test":
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, shuffle=False, one_hot=False,
             binarize=False, start=None, stop=None):
    """
    Load one split of the MNIST dataset.

    Parameters
    ----------
    which_set : {"train", "test"}
        Split to load.
    center : bool, optional
        If True, subtract the per-pixel mean over examples.
    shuffle : bool, optional
        If True, examples are shuffled with a fixed-seed generator.
    one_hot : bool, optional
        If True, labels are expanded to float32 one-hot rows with 10
        columns.
    binarize : bool, optional
        If True, pixels are thresholded at 0.5.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".

    Raises
    ------
    ValueError
        On an unsupported `which_set` or an out-of-range `stop`.
    """
    self.args = locals()

    if which_set not in ["train", "test"]:
        if which_set == "valid":
            raise ValueError(
                "There is no such thing as the MNIST "
                "validation set. MNIST consists of 60,000 train examples and 10,000 test"
                " examples. If you wish to use a validation set you should divide the train "
                "set yourself. The pylearn2 dataset implements and will only ever implement "
                "the standard train / test split used in the literature."
            )
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,)
        )

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == "train":
            im_path = path + "train-images-idx3-ubyte"
            label_path = path + "train-labels-idx1-ubyte"
        else:
            assert which_set == "test"
            im_path = path + "t10k-images-idx3-ubyte"
            label_path = path + "t10k-labels-idx1-ubyte"

        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)
        topo_view = read_mnist_images(im_path, dtype="float32")
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype("float32")

        self.one_hot = one_hot
        if one_hot:
            one_hot = N.zeros((y.shape[0], 10), dtype="float32")
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.0
            y = one_hot

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        # `which_set` was validated above, so only these two cases exist.
        assert m == (60000 if which_set == "train" else 10000)

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both
                # one_hot=True/False.
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        # (A DefaultViewConverter used to be built here but was never
        # passed on; the unused local has been dropped.)
        super(MNIST, self).__init__(topo_view=topo_view, y=y)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop is None:
                # An omitted `stop` selects everything from `start` on.
                stop = self.X.shape[0]
            if stop > self.X.shape[0]:
                raise ValueError("stop=" + str(stop) + ">" +
                                 "m=" + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = np.zeros((1, 28, 28, 1))
        super(MNIST, self).__init__(topo_view=topo)
        self.X = None
def __init__(self, which_set, center=False, shuffle=False,
             binarize=False, start=None, stop=None,
             axes=['b', 0, 1, 'c'], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    """
    Load (or fabricate) one split of the sign24 dataset.

    The files are read with the MNIST readers from
    ``${PYLEARN2_DATA_PATH}/sign24/``.

    Parameters
    ----------
    which_set : {'train', 'test'}
        Split to load.
    center : bool, optional
        If True, subtract the per-pixel mean over examples.
    shuffle : bool, optional
        If True, examples are shuffled with a fixed-seed generator.
    binarize : bool, optional
        If True, pixels are thresholded at 0.5.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".
    axes : list, optional
        Axis order of the topological view handed to the parent class.
    preprocessor : object, optional
        If truthy, it is applied to the dataset.
    fit_preprocessor, fit_test_preprocessor : bool, optional
        For the test split these must agree unless
        `fit_test_preprocessor` is None.

    Raises
    ------
    ValueError
        On an unsupported `which_set` or an out-of-range `stop`.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature.")
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,))

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the `axes` order."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    # Expected number of examples per split; also used to size the
    # placeholder data so that the check below holds in both modes.
    expected_size = 3576 if which_set == 'train' else 1176

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/sign24/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'

        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)

        # Locally cache the files before reading them
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(im_path)
        label_path = datasetCache.cache_file(label_path)

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = np.atleast_2d(read_mnist_labels(label_path)).T
    else:
        # Placeholder data. It previously had 15 / 5 rows, which always
        # tripped the example-count assertion further down.
        topo_view = np.random.rand(expected_size, 28, 28)
        y = np.random.randint(0, 10, (expected_size, 1))

    if binarize:
        topo_view = (topo_view > 0.5).astype('float32')

    y_labels = 24

    m, r, c = topo_view.shape
    assert r == 28
    assert c == 28
    topo_view = topo_view.reshape(m, r, c, 1)

    assert m == expected_size

    if center:
        topo_view -= topo_view.mean(axis=0)

    if shuffle:
        self.shuffle_rng = make_np_rng(
            None, [1, 2, 3], which_method="shuffle")
        for i in xrange(topo_view.shape[0]):
            j = self.shuffle_rng.randint(m)
            # Copy ensures that memory is not aliased.
            tmp = topo_view[i, :, :, :].copy()
            topo_view[i, :, :, :] = topo_view[j, :, :, :]
            topo_view[j, :, :, :] = tmp

            tmp = y[i:i + 1].copy()
            y[i] = y[j]
            y[j] = tmp

    super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                axes=axes, y_labels=y_labels)

    assert not N.any(N.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop is None:
            # An omitted `stop` selects everything from `start` onwards.
            stop = self.X.shape[0]
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, shuffle=False, one_hot=None,
             binarize=False, start=None, stop=None,
             axes=['b', 0, 1, 'c'], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    """
    Load (or fabricate) one split of the MNIST dataset.

    Parameters
    ----------
    which_set : {'train', 'test'}
        Split to load.
    center : bool, optional
        If True, subtract the per-pixel mean over examples.
    shuffle : bool, optional
        If True, examples are shuffled with a fixed-seed generator.
    one_hot : object, optional
        Deprecated; any value other than None only emits a warning.
    binarize : bool, optional
        If True, pixels are thresholded at 0.5.
    start, stop : int, optional
        Row range kept after construction. Only applied when `start` is
        given; `stop=None` then means "through the last row".
    axes : list, optional
        Axis order of the topological view handed to the parent class.
    preprocessor : object, optional
        If truthy, it is applied to the dataset.
    fit_preprocessor, fit_test_preprocessor : bool, optional
        For the test split these must agree unless
        `fit_test_preprocessor` is None.

    Raises
    ------
    ValueError
        On an unsupported `which_set` or an out-of-range `stop`.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature.")
        raise ValueError(
            'Unrecognized which_set value "%s". Valid values are '
            '["train","test"].' % (which_set,))

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the `axes` order."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    # `which_set` was validated above, so only these two cases exist.
    expected_size = 60000 if which_set == 'train' else 10000

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'

        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)

        # Locally cache the files before reading them
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(im_path)
        label_path = datasetCache.cache_file(label_path)

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = np.atleast_2d(read_mnist_labels(label_path)).T
    else:
        # Placeholder data used when loading is disabled.
        topo_view = np.random.rand(expected_size, 28, 28)
        y = np.random.randint(0, 10, (expected_size, 1))

    if binarize:
        topo_view = (topo_view > 0.5).astype('float32')

    max_labels = 10
    if one_hot is not None:
        warnings.warn("the `one_hot` parameter is deprecated. To get "
                      "one-hot encoded targets, request that they "
                      "live in `VectorSpace` through the `data_specs` "
                      "parameter of MNIST's iterator method. "
                      "`one_hot` will be removed on or after "
                      "September 20, 2014.", stacklevel=2)

    m, r, c = topo_view.shape
    assert r == 28
    assert c == 28
    topo_view = topo_view.reshape(m, r, c, 1)

    assert m == expected_size

    if center:
        topo_view -= topo_view.mean(axis=0)

    if shuffle:
        self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                       which_method="shuffle")
        for i in xrange(topo_view.shape[0]):
            j = self.shuffle_rng.randint(m)
            # Copy ensures that memory is not aliased.
            tmp = topo_view[i, :, :, :].copy()
            topo_view[i, :, :, :] = topo_view[j, :, :, :]
            topo_view[j, :, :, :] = tmp
            # Note: slicing with i:i+1 works for one_hot=True/False
            tmp = y[i:i + 1].copy()
            y[i] = y[j]
            y[j] = tmp

    super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                axes=axes, y_labels=max_labels)

    assert not N.any(N.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop is None:
            # An omitted `stop` selects everything from `start` onwards.
            stop = self.X.shape[0]
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Build the MRI dataset from the pickled ADNI arrays.

    Parameters
    ----------
    which_set : str
        'train' expects 397 examples; any other value expects 50.
        NOTE(review): the same two files are read whatever `which_set`
        is -- confirm this is intended.
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        Requested axis order, a permutation of ('b', 0, 1, 'c').
    """
    self.args = locals()

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the order `axes`."""
        canonical = ('b', 0, 1, 'c')
        order = [canonical.index(axis) for axis in axes]
        return b01c.transpose(*order)

    if control.get_load_data():
        base = "/local/Pylearn2/data/ADNI/"

        # Locally cache the files before reading them
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(base + 'ADNI_X_down')
        label_path = datasetCache.cache_file(base + 'ADNI_y_down')

        # NOTE: pickle must only ever be used on trusted files.
        with open_if_filename(im_path, 'rb') as handle:
            topo_view = pickle.load(handle)
        with open_if_filename(label_path, 'rb') as handle:
            y = pickle.load(handle)
    else:
        # Data loading is disabled: random stand-in arrays.
        size = 397 if which_set == 'train' else 50
        topo_view = np.random.rand(size, 64, 64, 41)
        y = np.random.randint(0, 3, (size, 1))

    y_labels = 3

    m, r, c, d = topo_view.shape
    assert r == 64
    assert c == 64
    topo_view = topo_view.reshape(m, r, c, 41)

    expected_m = 397 if which_set == 'train' else 50
    assert m == expected_m

    super(MRI, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                              axes=axes, y_labels=y_labels)

    assert not N.any(N.isnan(self.X))

    if start is not None:
        assert start >= 0
        assert stop > start
        wanted = stop - start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != wanted:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == wanted
def __init__(self, which_set, center=False, shuffle=False,
             one_hot=False, binarize=False, start=None,
             stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None,
             fit_preprocessor=False,
             fit_test_preprocessor=False):
    """
    Build the MNIST train or test split.

    Parameters
    ----------
    which_set : str
        Either 'train' or 'test'; anything else raises ValueError.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        an RNG obtained from `make_np_rng` (default seed [1, 2, 3]).
    one_hot : bool
        If True, labels are expanded to a 10-column float32 one-hot
        matrix and `max_labels=None` is passed to the parent class;
        otherwise `max_labels=10`.
    binarize : bool
        If True, replace every pixel by `pixel > 0.5` (as float32).
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        Requested axis order, a permutation of ('b', 0, 1, 'c').
    preprocessor : object, optional
        If truthy (and self.X is not None),
        `preprocessor.apply(self, fit_preprocessor)` is called last.
    fit_preprocessor : bool
        Forwarded to `preprocessor.apply`.
    fit_test_preprocessor : bool
        For the test set, must be None or equal to `fit_preprocessor`.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            # Adjacent literals are concatenated verbatim, so each
            # fragment must carry its own trailing space.
            raise ValueError(
                "There is no such thing as the MNIST validation set. "
                "MNIST consists of 60,000 train examples and 10,000 "
                "test examples. If you wish to use a validation set "
                "you should divide the train set yourself. The "
                "pylearn2 dataset implements and will only ever "
                "implement the standard train / test split used in "
                "the literature.")
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","test"].')

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the order `axes`."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'
        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)
        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded
            max_labels = None
        else:
            max_labels = 10

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                           which_method="shuffle")
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for one_hot=True/False
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                    axes=axes, max_labels=max_labels)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 28, 28, 1)))
        super(MNIST, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=None, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y`, `y_labels` and `X` attributes are reused.
    preprocessor : object
        Preprocessor exposing `P_`; if its `inv_P_` is missing or None
        the inverse of `P_` is computed here and stored on it.
    convert_to_one_hot : object
        Deprecated. Any value other than None only triggers a warning.
    start, stop : int, optional
        When `start` is not None (and data loading is enabled), only
        rows `start:stop` of X (and of y, if present) are kept.
    axes : sequence
        Passed to `self.view_converter.set_axes`.
    """
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor

    # Mirror the wrapped dataset's metadata on this object.
    for attr in ('rng', 'data_specs', 'X_space', 'X_topo_space',
                 'view_converter', 'y', 'y_labels'):
        setattr(self, attr, getattr(preprocessed_dataset, attr))

    if convert_to_one_hot is not None:
        warnings.warn(
            "the `convert_to_one_hot` parameter is deprecated. To get "
            "one-hot encoded targets, request that they "
            "live in `VectorSpace` through the `data_specs` "
            "parameter of dataset iterator method. "
            "`convert_to_one_hot` will be removed on or after "
            "September 20, 2014.", stacklevel=2)

    if not control.get_load_data():
        self.X = None
    elif start is None:
        self.X = preprocessed_dataset.X
    else:
        self.X = preprocessed_dataset.X[start:stop, :]
        if self.y is not None:
            self.y = self.y[start:stop, :]
        assert self.X.shape[0] == stop - start

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    if getattr(preprocessor, "inv_P_", None) is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')

    self.view_converter.set_axes(axes)
def __init__(self, which_set, center=False, shuffle=False,
             one_hot=False, binarize=False, start=None,
             stop=None):
    """
    Build the MNIST train or test split.

    Parameters
    ----------
    which_set : str
        Either 'train' or 'test'; anything else raises ValueError.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        `np.random.RandomState([1, 2, 3])`.
    one_hot : bool
        If True, labels are expanded to a 10-column float32 one-hot
        matrix.
    binarize : bool
        If True, replace every pixel by `pixel > 0.5` (as float32).
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST "
                "validation set. MNIST consists of 60,000 train examples and 10,000 test"
                " examples. If you wish to use a validation set you should divide the train "
                "set yourself. The pylearn2 dataset implements and will only ever implement "
                "the standard train / test split used in the literature.")
        # The closing quote and period belong to the first fragment
        # only; the old message repeated them.
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","test"].')

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both
                # one_hot=True/False.
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        # (An unused DefaultViewConverter used to be built here; the
        # parent constructor is only ever given topo_view and y.)
        super(MNIST, self).__init__(topo_view=topo_view, y=y)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = np.zeros((1, 28, 28, 1))
        super(MNIST, self).__init__(topo_view=topo)
        self.X = None
def __setstate__(self, d):
    """
    Restore the instance from the pickled state dictionary `d`.

    If `d['design_loc']` is set, X is reloaded from that location (or
    set to None when data loading is disabled). If `d['compress']` is
    truthy, X is rescaled to float32 using `compress_max` and
    `compress_min`, which are removed from `d`. State lacking any of
    the data_specs-era keys gets those attributes rebuilt from X, y and
    the stored view converter.

    Parameters
    ----------
    d : dict
        Pickled state; modified in place.

    Raises
    ------
    NotImplementedError
        If a stored view converter is not a 3-dimensional
        DefaultViewConverter.
    """
    design_loc = d['design_loc']
    if design_loc is not None:
        d['X'] = N.load(design_loc) if control.get_load_data() else None

    if not d['compress']:
        self.__dict__.update(d)
    else:
        packed = d['X']
        scale_max = d['compress_max']
        scale_min = d['compress_min']
        del d['compress_max']
        del d['compress_min']
        # Placeholder so that update() does not install the packed X.
        d['X'] = 0
        self.__dict__.update(d)
        if packed is None:
            self.X = None
        else:
            self.X = N.cast['float32'](packed) * scale_max / 255. + scale_min

    # To be able to unpickle older data after the addition of
    # the data_specs mechanism
    modern_keys = ('data_specs', 'X_space',
                   '_iter_data_specs', 'X_topo_space')
    if any(key not in d for key in modern_keys):
        X_space = VectorSpace(dim=self.X.shape[1])
        X_source = 'features'
        if self.y is not None:
            y_space = VectorSpace(dim=self.y.shape[-1])
            space = CompositeSpace((X_space, y_space))
            source = (X_source, 'targets')
        else:
            space = X_space
            source = X_source

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

        view_converter = d.get('view_converter', None)
        if view_converter is not None:
            # Build a Conv2DSpace from the view_converter
            supported = (isinstance(view_converter, DefaultViewConverter)
                         and len(view_converter.shape) == 3)
            if not supported:
                raise NotImplementedError(
                    "Not able to build a Conv2DSpace "
                    "corresponding to this converter: %s"
                    % view_converter)

            axes = view_converter.axes
            rows, cols, channels = view_converter.shape

            # self.X_topo_space stores a "default" topological space that
            # will be used only when self.iterator is called without a
            # data_specs, and with "topo=True", which is deprecated.
            self.X_topo_space = Conv2DSpace(shape=(rows, cols),
                                            num_channels=channels,
                                            axes=axes)
def __init__(self, which_set, shuffle=False, start=None, stop=None,
             axes=['b', 0, 1, 'c'], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    """
    Build one split of the binarized MNIST dataset.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'.
    shuffle : bool
        If True, shuffle the rows of X in place with an RNG obtained
        from `make_np_rng` (default seed [1, 2, 3]).
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        NOTE(review): accepted and recorded in `self.args`, but not
        used when building the view converter -- confirm whether it
        should be forwarded.
    preprocessor : object, optional
        If truthy (and self.X is not None),
        `preprocessor.apply(self, fit_preprocessor)` is called last.
    fit_preprocessor : bool
        Forwarded to `preprocessor.apply`.
    fit_test_preprocessor : bool
        For the test set, must be None or equal to `fit_preprocessor`.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    NotInstalledError
        If the data file cannot be read.
    """
    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        # The closing quote and period belong to the first fragment
        # only; the old message repeated them.
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ' + '["train", "valid", "test"].')

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/binarized_mnist/binarized_mnist_" + \
               which_set + ".npy"
        im_path = serial.preprocess(path)

        # Locally cache the files before reading them
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(im_path)

        try:
            X = serial.load(im_path)
        except IOError:
            raise NotInstalledError("BinarizedMNIST data files cannot be "
                                    "found in ${PYLEARN2_DATA_PATH}. Run "
                                    "pylearn2/scripts/datasets/"
                                    "download_binarized_mnist.py to get "
                                    "the data")
    else:
        # Data loading is disabled: random binary stand-in data.
        if which_set == 'train':
            size = 50000
        else:
            size = 10000
        X = numpy.random.binomial(n=1, p=0.5, size=(size, 28 ** 2))

    m, d = X.shape
    assert d == 28 ** 2
    if which_set == 'train':
        assert m == 50000
    else:
        assert m == 10000

    if shuffle:
        self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                       which_method="shuffle")
        for i in xrange(X.shape[0]):
            j = self.shuffle_rng.randint(m)
            # Copy ensures that memory is not aliased.
            tmp = X[i, :].copy()
            X[i, :] = X[j, :]
            X[j, :] = tmp

    super(BinarizedMNIST, self).__init__(
        X=X,
        view_converter=DefaultViewConverter(shape=(28, 28, 1))
    )

    assert not numpy.any(numpy.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, shuffle=False,
             one_hot=False, binarize=False):
    """
    Build the MNIST train or test split.

    Parameters
    ----------
    which_set : str
        Either 'train' or 'test'; anything else raises ValueError.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        `np.random.RandomState([1, 2, 3])`.
    one_hot : bool
        If True, labels are expanded to a 10-column float32 one-hot
        matrix.
    binarize : bool
        If True, replace every pixel by `pixel > 0.5` (as float32).

    Raises
    ------
    ValueError
        On an unknown `which_set`.
    """
    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST "
                "validation set. MNIST consists of 60,000 train examples and 10,000 test"
                " examples. If you wish to use a validation set you should divide the train "
                "set yourself. The pylearn2 dataset implements and will only ever implement "
                "the standard train / test split used in the literature.")
        # The closing quote and period belong to the first fragment
        # only; the old message repeated them.
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","test"].')

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Swap rows i and j. Basic indexing (`a[i]`) yields a
                # view, so the old `tmp = a[i]; a[i] = a[j]; a[j] = tmp`
                # wrote row j back onto itself and lost row i. Indexing
                # with a list makes the right-hand side a copy, so the
                # exchange is alias-free for images and for both 1-D
                # and one-hot labels.
                topo_view[[i, j]] = topo_view[[j, i]]
                y[[i, j]] = y[[j, i]]

        # (An unused DefaultViewConverter used to be built here; the
        # parent constructor is only ever given topo_view and y.)
        super(MNIST, self).__init__(topo_view=topo_view, y=y)

        assert not N.any(N.isnan(self.X))
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = np.zeros((1, 28, 28, 1))
        super(MNIST, self).__init__(topo_view=topo)
        self.X = None
def __init__(self, preprocessed_dataset, preprocessor,
             convert_to_one_hot=None, start=None, stop=None,
             axes=['b', 0, 1, 'c']):
    """
    Wrap an already-preprocessed dataset together with its preprocessor.

    Parameters
    ----------
    preprocessed_dataset : object
        Dataset whose `rng`, `data_specs`, `X_space`, `X_topo_space`,
        `view_converter`, `y`, `y_labels` and `X` attributes are reused.
    preprocessor : object
        Preprocessor exposing `P_`; if its `inv_P_` is missing or None
        the inverse of `P_` is computed here and stored on it.
    convert_to_one_hot : object
        Deprecated. Any value other than None only triggers a warning.
    start, stop : int, optional
        When `start` is not None (and data loading is enabled), only
        rows `start:stop` of X (and of y, if present) are kept.
    axes : sequence
        Passed to `self.view_converter.set_axes`.
    """
    self.args = locals()

    self.preprocessed_dataset = preprocessed_dataset
    self.preprocessor = preprocessor

    # Mirror the wrapped dataset's metadata on this object.
    source_ds = self.preprocessed_dataset
    self.rng = source_ds.rng
    self.data_specs = source_ds.data_specs
    self.X_space = source_ds.X_space
    self.X_topo_space = source_ds.X_topo_space
    self.view_converter = source_ds.view_converter
    self.y = source_ds.y
    self.y_labels = source_ds.y_labels

    deprecated_flag_used = convert_to_one_hot is not None
    if deprecated_flag_used:
        warnings.warn("the `convert_to_one_hot` parameter is deprecated. To get "
                      "one-hot encoded targets, request that they "
                      "live in `VectorSpace` through the `data_specs` "
                      "parameter of dataset iterator method. "
                      "`convert_to_one_hot` will be removed on or after "
                      "September 20, 2014.", stacklevel=2)

    if not control.get_load_data():
        self.X = None
    elif start is None:
        self.X = preprocessed_dataset.X
    else:
        self.X = preprocessed_dataset.X[start:stop, :]
        if self.y is not None:
            self.y = self.y[start:stop, :]
        assert self.X.shape[0] == stop - start

    if self.X is not None and self.y is not None:
        assert self.y.shape[0] == self.X.shape[0]

    inverse = getattr(preprocessor, "inv_P_", None)
    if inverse is None:
        warnings.warn("ZCA preprocessor.inv_P_ was none. Computing "
                      "inverse of preprocessor.P_ now. This will take "
                      "some time. For efficiency, it is recommended that "
                      "in the future you compute the inverse in ZCA.fit() "
                      "instead, by passing it compute_inverse=True.")
        logger.info('inverting...')
        preprocessor.inv_P_ = np.linalg.inv(preprocessor.P_)
        logger.info('...done inverting')

    self.view_converter.set_axes(axes)
def __setstate__(self, d):
    """
    Restore the instance from the pickled state dictionary `d`.

    If `d['design_loc']` is set, X is reloaded from that location (or
    set to None when data loading is disabled). If `d['compress']` is
    truthy, X is rescaled to float32 using `compress_max` and
    `compress_min`, which are removed from `d`. State lacking any of
    the data_specs-era keys gets those attributes rebuilt from X, y and
    the stored view converter.

    Parameters
    ----------
    d : dict
        Pickled state; modified in place.

    Raises
    ------
    NotImplementedError
        If a stored view converter is not a 3-dimensional
        DefaultViewConverter.
    """
    design_loc = d['design_loc']
    if design_loc is not None:
        if control.get_load_data():
            d['X'] = N.load(design_loc)
        else:
            d['X'] = None

    if not d['compress']:
        self.__dict__.update(d)
    else:
        stored = d['X']
        hi = d['compress_max']
        lo = d['compress_min']
        del d['compress_max']
        del d['compress_min']
        # Placeholder so that update() does not install the packed X.
        d['X'] = 0
        self.__dict__.update(d)
        self.X = None
        if stored is not None:
            self.X = N.cast['float32'](stored) * hi / 255. + lo

    # To be able to unpickle older data after the addition of
    # the data_specs mechanism
    needed = ('data_specs', 'X_space',
              '_iter_data_specs', 'X_topo_space')
    if any(name not in d for name in needed):
        X_space = VectorSpace(dim=self.X.shape[1])
        X_source = 'features'
        if self.y is not None:
            y_space = VectorSpace(dim=self.y.shape[-1])
            space = CompositeSpace((X_space, y_space))
            source = (X_source, 'targets')
        else:
            space = X_space
            source = X_source

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

        view_converter = d.get('view_converter', None)
        if view_converter is not None:
            # Build a Conv2DSpace from the view_converter
            if (not isinstance(view_converter, DefaultViewConverter)
                    or len(view_converter.shape) != 3):
                raise NotImplementedError(
                    "Not able to build a Conv2DSpace "
                    "corresponding to this converter: %s"
                    % view_converter)

            axes = view_converter.axes
            rows, cols, channels = view_converter.shape

            # self.X_topo_space stores a "default" topological space that
            # will be used only when self.iterator is called without a
            # data_specs, and with "topo=True", which is deprecated.
            self.X_topo_space = Conv2DSpace(shape=(rows, cols),
                                            num_channels=channels,
                                            axes=axes)
def __init__(self, which_set, imgd=65, zd=1, ds=1, center=False,
             shuffle=False, one_hot=False, binarize=False,
             start=None, stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None, fit_preprocessor=False,
             fit_test_preprocessor=False):
    """
    Build one split of the LGN membrane-sample dataset.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'.
    imgd : int
        Number of rows and of columns each example is reshaped to.
    zd : int
        Number of slices (last axis) each example is reshaped to.
    ds : int
        Recorded in `self.args`; not otherwise used here.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        `np.random.RandomState([1, 2, 3])`.
    one_hot : bool
        If True, labels are expanded to a 2-column float32 one-hot
        matrix.
    binarize : bool
        Recorded in `self.args`; not otherwise used here.
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        Requested axis order, a permutation of ('b', 0, 1, 'c').
    preprocessor : object, optional
        If truthy (and self.X is not None),
        `preprocessor.apply(self, fit_preprocessor)` is called last.
    fit_preprocessor : bool
        Forwarded to `preprocessor.apply`.
    fit_test_preprocessor : bool
        For the test set, must be None or equal to `fit_preprocessor`.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    """
    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","valid","test"].')

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the order `axes`."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/lgn/"
        path = path + "LGN1_MembraneSamples_65x65x1_mp0.50_train50000_valid10000_test10000_seed11.pkl.gz"
        path = serial.preprocess(path)

        # Close the archive even if unpickling fails.
        # NOTE: pickle must only ever be used on trusted files.
        f = gzip.open(path, 'rb')
        try:
            train_set, valid_set, test_set = cPickle.load(f)
        finally:
            f.close()

        if which_set == 'train':
            data = train_set
        elif which_set == 'valid':
            data = valid_set
        else:
            data = test_set

        input_shape = (imgd, imgd, zd)

        # Convert images to float 0-1
        topo_view = data[0].astype(np.float32) / 255.0
        y = data[1]

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 2), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded

        m = topo_view.shape[0]
        rows, cols, slices = input_shape
        topo_view = topo_view.reshape(m, rows, cols, slices)

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both
                # one_hot=True/False.
                tmp = y[i:i+1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(LGN, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                  axes=axes)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 65, 65, 1)))
        super(LGN, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, imgd=49, zd=1, ds=1, center=False,
             shuffle=False, one_hot=False, binarize=False,
             start=None, stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None, fit_preprocessor=False,
             fit_test_preprocessor=False):
    """
    Build one split of the ISBI 2012 membrane-sample dataset.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'.
    imgd : int
        Number of rows and of columns each example is reshaped to.
    zd : int
        Number of slices (last axis) each example is reshaped to.
    ds : int
        Recorded in `self.args`; not otherwise used here.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        `np.random.RandomState([1, 2, 3])`.
    one_hot : bool
        If True, labels are expanded to a 2-column float32 one-hot
        matrix.
    binarize : bool
        Recorded in `self.args`; not otherwise used here.
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        Requested axis order, a permutation of ('b', 0, 1, 'c').
    preprocessor : object, optional
        If truthy (and self.X is not None),
        `preprocessor.apply(self, fit_preprocessor)` is called last.
    fit_preprocessor : bool
        Forwarded to `preprocessor.apply`.
    fit_test_preprocessor : bool
        For the test set, must be None or equal to `fit_preprocessor`.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    """
    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","valid","test"].')

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the order `axes`."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/ISBI_2012/"
        path = path + "ISBI_MembraneSamples_49x49_mp0.50_train50000_valid5000_test5000_seed11_ds2b.pkl.gz"
        path = serial.preprocess(path)

        # Close the archive even if unpickling fails.
        # NOTE: pickle must only ever be used on trusted files.
        f = gzip.open(path, 'rb')
        try:
            train_set, valid_set, test_set = cPickle.load(f)
        finally:
            f.close()

        if which_set == 'train':
            data = train_set
        elif which_set == 'valid':
            data = valid_set
        else:
            data = test_set

        input_shape = (imgd, imgd, zd)

        # Convert images to float 0-1
        topo_view = data[0].astype(np.float32) / 255.0
        y = data[1]

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 2), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded

        m = topo_view.shape[0]
        rows, cols, slices = input_shape
        topo_view = topo_view.reshape(m, rows, cols, slices)

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both
                # one_hot=True/False.
                tmp = y[i:i+1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(ISBI, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                   axes=axes)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 49, 49, 1)))
        super(ISBI, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, start=None, stop=None):
    """
    Build one split of the millenniumSAMs dataset from its pickle.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'.
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    NotImplementedError
        When data loading is disabled: the per-split sizes needed to
        fabricate stand-in data are not known here.
    """
    self.args = locals()

    # The previous check nested an unreachable `which_set == 'valid'`
    # test and therefore never rejected anything.
    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","valid","test"].')

    if not control.get_load_data():
        # The old fallback built MNIST-shaped arrays but never defined
        # X, so it always ended in a NameError; fail explicitly.
        raise NotImplementedError(
            "MILLI_SAM cannot generate placeholder data; "
            "data loading must be enabled.")

    path = "${PYLEARN2_DATA_PATH}/milleniumSAMs/"
    if which_set == 'train':
        data_path = path + 'milliTrain.pickle.gz'
    elif which_set == 'valid':
        data_path = path + 'milliValid.pickle.gz'
    else:
        assert which_set == 'test'
        data_path = path + 'milliTest.pickle.gz'

    # Expand ${PYLEARN2_DATA_PATH} before touching the file system.
    data_path = serial.preprocess(data_path)

    # Locally cache the files before reading them
    datasetCache = cache.datasetCache
    data_path = datasetCache.cache_file(data_path)

    # Close the archive even if unpickling fails.
    # NOTE: pickle must only ever be used on trusted files.
    f = gzip.open(data_path)
    try:
        X, y = pickle.load(f)
    finally:
        f.close()

    m, r = X.shape
    assert r == 193

    super(MILLI_SAM, self).__init__(X=X, y=y)

    assert not np.any(np.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start
def __init__(self, which_set, center=False, shuffle=False,
             one_hot=False, binarize=False, start=None,
             stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None,
             fit_preprocessor=False,
             fit_test_preprocessor=False):
    """
    Build the MNIST train or test split.

    Parameters
    ----------
    which_set : str
        Either 'train' or 'test'; anything else raises ValueError.
    center : bool
        If True, subtract the mean over the example axis.
    shuffle : bool
        If True, shuffle the examples (and their labels) in place with
        `np.random.RandomState([1, 2, 3])`.
    one_hot : bool
        If True, labels are expanded to a 10-column float32 one-hot
        matrix.
    binarize : bool
        If True, replace every pixel by `pixel > 0.5` (as float32).
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    axes : sequence
        Requested axis order, a permutation of ('b', 0, 1, 'c').
    preprocessor : object, optional
        If truthy (and self.X is not None),
        `preprocessor.apply(self, fit_preprocessor)` is called last.
    fit_preprocessor : bool
        Forwarded to `preprocessor.apply`.
    fit_test_preprocessor : bool
        For the test set, must be None or equal to `fit_preprocessor`.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.
    """
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST "
                "validation set. MNIST consists of 60,000 train examples and 10,000 test"
                " examples. If you wish to use a validation set you should divide the train "
                "set yourself. The pylearn2 dataset implements and will only ever implement "
                "the standard train / test split used in the literature.")
        # The closing quote and period belong to the first fragment
        # only; the old message repeated them.
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","test"].')

    def dimshuffle(b01c):
        """Transpose a ('b', 0, 1, 'c') array into the order `axes`."""
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'
        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)
        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        self.one_hot = one_hot
        if one_hot:
            # One row per example, a single 1. in the label's column.
            encoded = N.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                encoded[i, y[i]] = 1.
            y = encoded

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both
                # one_hot=True/False.
                tmp = y[i:i+1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                    axes=axes)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 28, 28, 1)))
        super(MNIST, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)

    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, start=None, stop=None, nParticles=None):
    """
    Build the nano-particle snapshot dataset.

    Snapshots 0..100 are read from HDF5 files. Row i of X (i in 0..99)
    holds the scaled coordinates and velocities of the selected
    particles at snapshot i; row i of y holds their scaled coordinates
    at snapshot i + 1.

    Parameters
    ----------
    which_set : str
        One of 'train', 'valid' or 'test'.
        NOTE(review): the value is validated but does not influence
        which particles or snapshots are used -- confirm intent.
    start, stop : int, optional
        When `start` is not None, only rows `start:stop` are kept.
    nParticles : int, optional
        Number of particle indices drawn; defaults to all 262144.
        NOTE(review): `np.random.choice` is called with its default
        `replace=True`, so indices may repeat -- confirm intent.

    Raises
    ------
    ValueError
        On an unknown `which_set`, or when `stop` exceeds the number of
        examples.

    Notes
    -----
    With data loading enabled the global NumPy RNG is re-seeded with 0.
    """
    self.args = locals()

    # Imported at function level (not inside the loading branch) so
    # that `np` is bound on every code path of this function.
    import numpy as np

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s". ' % (which_set,) +
            'Valid values are ["train","test", "valid"].')

    totParticles = 262144
    if nParticles is None:
        nParticles = totParticles

    if control.get_load_data():
        # have to load in the whole array from the start anyway
        import h5py

        path = "${PYLEARN2_DATA_PATH}/nanoParticle/"

        np.random.seed(0)
        idxs = np.random.choice(totParticles, nParticles)
        particle_sel = np.s_[idxs, :]

        X = np.zeros((100, nParticles * 6))
        y = np.zeros((100, nParticles * 3))

        def getFilename(i):
            """Return the expanded path of snapshot `i` (zero-padded)."""
            return serial.preprocess(path + 'snapshot_%03d.hdf5' % i)

        absMinVel, absMaxVal = 0, 0
        maxCoord = 10000  # particles in a 0-10000 cube
        for i in xrange(101):
            f = h5py.File(getFilename(i), 'r')
            try:
                ids = f['PartType1']['ParticleIDs'][()]
                sorter = ids.argsort()
                coords = f['PartType1']['Coordinates'][()]
                coords = coords[sorter]  # sort by ids
                # normalize: coordinates are all >= 0, so just divide
                # by max
                coords /= maxCoord
                coords = coords[particle_sel]

                # Snapshot i is the target of snapshot i - 1.
                if i != 0:
                    y[i - 1, :] = coords.flatten()

                # The final snapshot is a target only.
                if i != 100:
                    vels = f['PartType1']['Velocities'][()]
                    vels = vels[sorter]
                    # The range is tracked over all particles, before
                    # sub-selection.
                    minVel, maxVel = vels.min(), vels.max()
                    if minVel < absMinVel:
                        absMinVel = minVel
                    if maxVel > absMaxVal:
                        absMaxVal = maxVel
                    vels = vels[particle_sel]
                    X[i, :] = np.concatenate((coords, vels),
                                             axis=1).flatten()
            finally:
                # Release the HDF5 handle even if reading fails.
                f.close()

        # normalize the velocity columns: each particle owns six
        # consecutive columns, the last three being velocities. X is
        # freshly allocated and contiguous, so the reshape is a view
        # and the assignment writes through to X.
        vel_cols = X.reshape(X.shape[0], nParticles, 6)[:, :, 3:]
        vel_cols[...] = (vel_cols - absMinVel) / (absMaxVal - absMinVel)
    else:
        # Data loading is disabled: random stand-in arrays with the
        # same shapes as the real ones. (The old fallback produced
        # MNIST-shaped arrays and never defined X.)
        X = np.random.rand(100, nParticles * 6)
        y = np.random.rand(100, nParticles * 3)

    m, r = X.shape
    assert m == 100
    n, s = y.shape
    assert n == 100

    super(NANO_PARTICLE, self).__init__(X=X, y=y)
    # Drop the local references to the large arrays.
    del X
    del y

    assert not np.any(np.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start