def __init__(self, filename, X=None, topo_view=None, y=None,
             load_all=False, **kwargs):
    if 'preprocessor' in kwargs:
        if ('fit_preprocessor' in kwargs and
                kwargs['fit_preprocessor'] is False) or \
                ('fit_preprocessor' not in kwargs):
            self._preprocessor = kwargs['preprocessor']
            kwargs['preprocessor'] = None
    else:
        self._preprocessor = None
    self.load_all = load_all

    if h5py is None:
        raise RuntimeError("Could not import h5py.")
    self._file = h5py.File(filename)

    if X is not None:
        X = self.get_dataset(X, load_all)
    if topo_view is not None:
        topo_view = self.get_dataset(topo_view, load_all)
    if y is not None:
        y = self.get_dataset(y, load_all)

    DenseDesignMatrix.__init__(self, X=X, topo_view=topo_view, y=y,
                               **kwargs)
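# A minimal usage sketch for the constructor above, assuming it belongs to
# an HDF5-backed DenseDesignMatrix subclass. The class name HDF5Dataset,
# the file name 'train.h5', and the member names 'X' and 'y' are all
# hypothetical stand-ins: X, topo_view and y are passed as the names of
# datasets stored inside the HDF5 file, and load_all=False keeps the data
# on disk to be read lazily.
train_set = HDF5Dataset('train.h5', X='X', y='y', load_all=False)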
def test_multiple_monitoring_datasets():
    # tests that DefaultTrainingAlgorithm can take multiple
    # monitoring datasets.

    BATCH_SIZE = 2
    BATCHES = 3
    NUM_FEATURES = 4
    dim = 3
    m = 10

    rng = np.random.RandomState([2014, 02, 25])
    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    train = DenseDesignMatrix(X=X)
    test = DenseDesignMatrix(X=Y)

    algorithm = DefaultTrainingAlgorithm(
        batch_size=BATCH_SIZE,
        batches_per_iter=BATCHES,
        monitoring_dataset={'train': train, 'test': test})

    # nvis must match the number of input features in the data
    model = S3C(nvis=dim, nhid=1,
                irange=.01, init_bias_hid=0., init_B=1.,
                min_B=1., max_B=1., init_alpha=1.,
                min_alpha=1., max_alpha=1., init_mu=0.,
                m_step=Grad_M_Step(learning_rate=0.),
                e_step=E_Step(h_new_coeff_schedule=[1.]))

    algorithm.setup(model=model, dataset=train)
    algorithm.train(dataset=train)
def test_convert_to_one_hot():
    rng = np.random.RandomState([2013, 11, 14])
    m = 11
    d = DenseDesignMatrix(
        X=rng.randn(m, 4),
        y=rng.randint(low=0, high=10, size=(m,)))
    d.convert_to_one_hot()
def get_feats_from_cnn(rows, model=None):
    """
    fprop rows using the best trained model and return activations of
    the penultimate layer
    """
    conf = utils.get_config()
    patch_size = conf['patch_size']
    region_size = conf['region_size']
    batch_size = None
    preds = utils.get_predictor(model=model, return_all=True)
    y = np.zeros(len(rows))
    samples = np.zeros(
        (len(rows), region_size, region_size, 1),
        dtype=np.float32)
    for i, row in enumerate(rows):
        print 'processing %i-th image: %s' % (i, row['image_filename'])
        try:
            samples[i] = utils.get_samples_from_image(row, False)[0]
        except ValueError as e:
            print '{1} Value error: {0}'.format(str(e),
                                                row['image_filename'])
        y[i] = utils.is_positive(row)
    ds = DenseDesignMatrix(topo_view=samples)
    pipeline = utils.get_pipeline(
        ds.X_topo_space.shape, patch_size, batch_size)
    pipeline.apply(ds)
    return preds[-2](ds.get_topological_view()), y
def test_unit_norm(self):
    """
    Test that using sqrt_bias = 0.0 and use_std = False results in
    vectors having unit norm
    """
    num_examples = 5
    num_features = 10

    rng = np.random.RandomState([1, 2, 3])

    X = as_floatX(rng.randn(num_examples, num_features))
    dataset = DenseDesignMatrix(X=X)

    # the setting of subtract_mean is not relevant to the test
    # the test only applies when sqrt_bias = 0.0 and use_std = False
    preprocessor = GlobalContrastNormalization(subtract_mean=False,
                                               sqrt_bias=0.0,
                                               use_std=False)
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    norms = np.sqrt(np.square(result).sum(axis=1))
    max_norm_error = np.abs(norms - 1.).max()
    tol = 3e-5
    assert max_norm_error < tol
def test_from_dataset():
    """
    Tests whether it supports integer labels.
    """
    rng = np.random.RandomState([1, 2, 3])
    topo_view = rng.randn(12, 2, 3, 3)
    y = rng.randint(0, 5, (12, 1))

    # without y:
    d1 = DenseDesignMatrix(topo_view=topo_view)
    slice_d = from_dataset(d1, 5)
    assert slice_d.X.shape[1] == d1.X.shape[1]
    assert slice_d.X.shape[0] == 5

    # with y:
    d2 = DenseDesignMatrix(topo_view=topo_view, y=y)
    slice_d = from_dataset(d2, 5)
    assert slice_d.X.shape[1] == d2.X.shape[1]
    assert slice_d.X.shape[0] == 5
    assert slice_d.y.shape[0] == 5

    # without topo_view:
    x = topo_view.reshape(12, 18)
    d3 = DenseDesignMatrix(X=x, y=y)
    slice_d = from_dataset(d3, 5)
    assert slice_d.X.shape[1] == d3.X.shape[1]
    assert slice_d.X.shape[0] == 5
    assert slice_d.y.shape[0] == 5
def get_matrices(self, n=3, ratios=[.7, .15, .15]):
    X = []
    y = []
    for ngr, w in self.iterate_ngram_training(n):
        X.append(ngr)
        y.append([w])
    X = numpy.array(X)
    y = numpy.array(y)
    total = len(y)
    # slice boundaries must be ints, not the floats round() returns
    training = int(round(total * ratios[0]))
    valid = training + int(round(total * ratios[1]))
    # test = total - training - valid
    labels = len(self.vocab)
    training_data = DenseDesignMatrix(X=X[:training, :], y=y[:training],
                                      X_labels=labels, y_labels=labels)
    valid_data = DenseDesignMatrix(X=X[training:valid, :],
                                   y=y[training:valid],
                                   X_labels=labels, y_labels=labels)
    test_data = DenseDesignMatrix(X=X[valid:, :], y=y[valid:],
                                  X_labels=labels, y_labels=labels)
    return training_data, valid_data, test_data
def create_batch_matrices(self, ratios=[.7, .15, .15]):
    res = self.read_batch()
    if res is None:
        return None
    X, y = res
    num_labels = len(self.needed) + 1  # for filtered words
    X = numpy.array(X)
    y = numpy.array(y)
    total = len(y)
    indices = range(total)
    shuffle(indices)
    training = int(round(total * ratios[0]))
    valid = int(round(total * ratios[1]))
    training_indices = indices[:training]
    valid_indices = indices[training:training + valid]
    # the remaining shuffled indices form the test set
    test_indices = indices[training + valid:]
    training_data = DenseDesignMatrix(X=X[training_indices, :],
                                      y=y[training_indices],
                                      X_labels=num_labels,
                                      y_labels=num_labels)
    valid_data = DenseDesignMatrix(X=X[valid_indices, :],
                                   y=y[valid_indices],
                                   X_labels=num_labels,
                                   y_labels=num_labels)
    test_data = DenseDesignMatrix(X=X[test_indices, :],
                                  y=y[test_indices],
                                  X_labels=num_labels,
                                  y_labels=num_labels)
    return training_data, valid_data, test_data
def test_sgd_sup():
    # tests that we can run the sgd algorithm
    # on a supervised cost.
    # does not test for correctness at all, just
    # that the algorithm runs without dying

    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m, ))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    dataset = DenseDesignMatrix(X=X, y=Y)

    m = 15
    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m,))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    # Including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_dataset = DenseDesignMatrix(X=X, y=Y)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    cost = SupervisedDummyCost()

    # We need to include this so the test actually stops running at
    # some point
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=3,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=None)

    train.main_loop()
def next(self):
    next_index = self._subset_iterator.next()
    # convert to boolean selection
    sel = np.zeros(self.num_examples, dtype=bool)
    sel[next_index] = True
    next_index = sel

    rval = []
    for data, fn in safe_izip(self._raw_data, self._convert):
        try:
            this_data = data[next_index]
        except TypeError:
            this_data = data[next_index, :]
        if fn:
            this_data = fn(this_data)
        if self._preprocessor is not None:
            d = DenseDesignMatrix(X=this_data)
            self._preprocessor.apply(d)
            this_data = d.get_design_matrix()
        assert not np.any(np.isnan(this_data))
        rval.append(this_data)
    rval = tuple(rval)
    if not self._return_tuple and len(rval) == 1:
        rval, = rval
    return rval
def apply_ZCA_fast(patches, normalize, zca_preprocessor):
    patches = patches.astype(np.float32)
    if normalize:
        patches /= 255.0
    dataset = DenseDesignMatrix(X=patches.T)
    zca_preprocessor.apply(dataset)
    patches = dataset.get_design_matrix()
    return patches.T
def test_init_bias_target_marginals():
    """
    Test `Softmax` layer instantiation with `init_bias_target_marginals`.
    """
    batch_size = 5
    n_features = 5
    n_classes = 3
    n_targets = 3
    irange = 0.1
    learning_rate = 0.1

    X_data = np.random.random(size=(batch_size, n_features))

    Y_categorical = np.asarray([[0], [1], [1], [2], [2]])
    class_frequencies = np.asarray([.2, .4, .4])
    categorical_dataset = DenseDesignMatrix(X_data,
                                            y=Y_categorical,
                                            y_labels=n_classes)

    Y_continuous = np.random.random(size=(batch_size, n_targets))
    Y_means = np.mean(Y_continuous, axis=0)
    continuous_dataset = DenseDesignMatrix(X_data, y=Y_continuous)

    Y_multiclass = np.random.randint(n_classes,
                                     size=(batch_size, n_targets))
    multiclass_dataset = DenseDesignMatrix(X_data,
                                           y=Y_multiclass,
                                           y_labels=n_classes)

    def softmax_layer(dataset):
        return Softmax(n_classes, 'h0', irange=irange,
                       init_bias_target_marginals=dataset)

    valid_categorical_mlp = MLP(
        layers=[softmax_layer(categorical_dataset)],
        nvis=n_features
    )

    actual = valid_categorical_mlp.layers[0].b.get_value()
    expected = pseudoinverse_softmax_numpy(class_frequencies)
    assert np.allclose(actual, expected)

    valid_continuous_mlp = MLP(
        layers=[softmax_layer(continuous_dataset)],
        nvis=n_features
    )

    actual = valid_continuous_mlp.layers[0].b.get_value()
    expected = pseudoinverse_softmax_numpy(Y_means)
    assert np.allclose(actual, expected)

    def invalid_multiclass_mlp():
        return MLP(
            layers=[softmax_layer(multiclass_dataset)],
            nvis=n_features
        )
    assert_raises(AssertionError, invalid_multiclass_mlp)
def test_bgd_unsup():
    # tests that we can run the bgd algorithm
    # on an unsupervised cost.
    # does not test for correctness at all, just
    # that the algorithm runs without dying

    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)

    dataset = DenseDesignMatrix(X=X)

    m = 15
    X = rng.randn(m, dim)

    # including a monitoring dataset lets us test that
    # the monitor works with unsupervised data
    monitoring_dataset = DenseDesignMatrix(X=X)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    class DummyCost(Cost):

        def expr(self, model, data):
            self.get_data_specs(model)[0].validate(data)
            X = data
            return T.square(model(X) - X).mean()

        def get_data_specs(self, model):
            return (model.get_input_space(), model.get_input_source())

    cost = DummyCost()

    # We need to include this so the test actually stops running at
    # some point
    termination_criterion = EpochCounter(5)

    algorithm = BGD(cost,
                    batch_size=5,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=None)

    train.main_loop()
def make_dataset(num_batches):
    m = num_batches * batch_size
    X = rng.randn(m, num_features)
    y = rng.randn(m, num_features)

    rval = DenseDesignMatrix(X=X, y=y)
    rval.yaml_src = ""  # suppress no yaml_src warning

    return rval
def testing_multiple_datasets_with_specified_dataset_in_monitor_based_lr():
    # tests that the class MonitorBasedLRAdjuster in sgd.py can properly
    # use the specified dataset_name in the constructor when multiple
    # datasets exist.

    dim = 3
    m = 10

    rng = np.random.RandomState([06, 02, 2014])
    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at
    # some point
    epoch_num = 1

    # including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    monitoring_dataset = {'train': monitoring_train,
                          'test': monitoring_test}

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    dataset_name = monitoring_dataset.keys()[0]
    monitor_lr = MonitorBasedLRAdjuster(dataset_name=dataset_name)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=[monitor_lr])

    train.main_loop()
class testPCA:
    """
    Tests for PCA preprocessor
    """

    def setup(self):
        rng = np.random.RandomState([1, 2, 3])
        self.dataset = DenseDesignMatrix(X=as_floatX(rng.randn(15, 10)),
                                         y=as_floatX(rng.randn(15, 1)))
        self.num_components = self.dataset.get_design_matrix().shape[1] - 1

    def test_apply_no_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        principal components are arranged in decreasing order by variance
        """
        # sut is an abbreviation for System Under Test
        sut = PCA(self.num_components)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(
            cm * (np.ones(cm.shape[0]) - np.eye(cm.shape[0])),
            np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are in decreasing order
        assert (np.diag(cm)[:-1] > np.diag(cm)[1:]).all()

    def test_apply_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        variance is the same along all principal components and equal to one
        """
        sut = PCA(self.num_components, whiten=True)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(
            cm * (np.ones(cm.shape[0]) - np.eye(cm.shape[0])),
            np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are all ones
        np.testing.assert_almost_equal(np.diag(cm), np.ones(cm.shape[0]))

    def test_apply_reduce_num_components(self):
        """
        Checks whether PCA performs dimensionality reduction
        """
        sut = PCA(self.num_components - 1, whiten=True)
        sut.apply(self.dataset, True)

        assert self.dataset.get_design_matrix().shape[1] == \
            self.num_components - 1
def __init__(self, which_set, data_path=None,
             term_range=None, target_type='cluster100'):
    """
    which_set: a string specifying which portion of the dataset
        to load. Valid values are 'train', 'valid' or 'test'
    data_path: a string specifying the directory containing the
        webcluster data. If None (default), use environment
        variable WEBCLUSTER_DATA_PATH.
    term_range: a tuple for taking only a slice of the available
        terms. Default is to use all 6275. For example, an input
        range of (10, 2000) will truncate the 10 most frequent terms
        and the 6275-2000=4275 least frequent terms, where by
        frequency we mean how many unique documents each term is in.
    target_type: the type of targets to use. Valid options are
        'cluster[10,100,1000]'
    """
    self.__dict__.update(locals())
    del self.self
    self.corpus_terms = None
    self.doc_info = None

    print "loading WebCluster DDM. which_set =", self.which_set

    if self.data_path is None:
        self.data_path \
            = string_utils.preprocess('${WEBCLUSTER_DATA_PATH}')

    fname = os.path.join(self.data_path, which_set + '_doc_inputs.npy')
    X = np.load(fname)
    if self.term_range is not None:
        X = X[:, self.term_range[0]:self.term_range[1]]
    # row-normalize the term counts; the mean row sum should be 1
    X = X / X.sum(1).reshape(X.shape[0], 1)
    print X.sum(1).mean()

    fname = os.path.join(self.data_path, which_set + '_doc_targets.npy')
    # columns: 0:cluster10s, 1:cluster100s, 2:cluster1000s
    self.cluster_hierarchy = np.load(fname)

    y = None
    if self.target_type == 'cluster10':
        y = self.cluster_hierarchy[:, 0]
    elif self.target_type == 'cluster100':
        y = self.cluster_hierarchy[:, 1]
    elif self.target_type == 'cluster1000':
        y = self.cluster_hierarchy[:, 2]
    elif self.target_type is None:
        pass
    else:
        raise NotImplementedError()

    DenseDesignMatrix.__init__(self, X=X, y=y)

    print "... WebCluster ddm loaded"
def create_dense_design_matrix(x, y=None, num_classes=None):
    if y is None:
        return DenseDesignMatrix(X=x)
    if num_classes is None:
        return DenseDesignMatrix(X=x, y=y)

    y = y.reshape((-1, ))
    one_hot = np.zeros((y.shape[0], num_classes), dtype='float32')
    for i in xrange(y.shape[0]):
        one_hot[i, y[i]] = 1.
    return DenseDesignMatrix(X=x, y=one_hot)
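# A minimal sketch of how the helper above behaves; the data is made up
# and numpy is assumed to be imported as np, as in the other snippets.
x = np.random.randn(4, 2).astype('float32')
y = np.array([0, 2, 1, 0])
ds = create_dense_design_matrix(x, y=y, num_classes=3)
# each integer label is expanded to a one-hot row, e.g. label 2 -> [0., 0., 1.]
assert ds.y.shape == (4, 3)
assert ds.y[1, 2] == 1.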
def test(store_inverse):
    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(15, 10))
    preprocessed_X = copy.copy(X)
    preprocessor = ZCA(store_inverse=store_inverse)

    dataset = DenseDesignMatrix(X=preprocessed_X,
                                preprocessor=preprocessor,
                                fit_preprocessor=True)

    preprocessed_X = dataset.get_design_matrix()
    assert_allclose(X, preprocessor.inverse(preprocessed_X))
def convert_to_dataset(X, y):
    X = np.vstack(X)
    y = np.vstack(y)

    # convert labels
    y = self.label_converter.get_labels(y, self.label_mode)
    y = np.hstack(y)

    one_hot_y = one_hot(y)

    dataset = DenseDesignMatrix(X=X, y=one_hot_y)
    dataset.labels = y  # for confusion matrix
    return dataset
def make_dataset(num_batches):
    disturb_mem.disturb_mem()
    m = num_batches * batch_size
    X = rng.randn(m, num_features)
    y = np.zeros((m, 1))
    y[:, 0] = np.dot(X, w) > 0.

    rval = DenseDesignMatrix(X=X, y=y)
    rval.yaml_src = ""  # suppress no yaml_src warning

    X = rval.get_batch_design(batch_size)
    assert X.shape == (batch_size, num_features)

    return rval
def testing_multiple_datasets_in_monitor_based_lr():
    # tests that the class MonitorBasedLRAdjuster in sgd.py does not take
    # multiple datasets in which multiple channels ending in '_objective'
    # exist.
    # This case happens when the user has not specified either
    # channel_name or dataset_name in the constructor

    dim = 3
    m = 10

    rng = np.random.RandomState([06, 02, 2014])
    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at
    # some point
    epoch_num = 1

    # including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=2,
                    monitoring_dataset={'train': monitoring_train,
                                        'test': monitoring_test},
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    monitor_lr = MonitorBasedLRAdjuster()

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=[monitor_lr])

    try:
        train.main_loop()
    except ValueError:
        return

    raise AssertionError("MonitorBasedLRAdjuster takes multiple dataset "
                         "names in which more than one \"objective\" "
                         "channel exists and the user has not specified "
                         "either channel_name or dataset_name in the "
                         "constructor to disambiguate.")
def __init__(self, raw, transformer, cpu_only=False,
             space_preserving=False, block_length=1):
    """
    .. todo::

        WRITEME properly

    Parameters
    ----------
    raw : pylearn2 Dataset
        Provides raw data
    transformer : pylearn2 Block
        To transform the data
    block_length : int
        Length of the timeseries: the number of elements of the
        timeseries grouped into each block
    """
    assert block_length >= 1
    if block_length != 1:
        timeseries = Timeseries(X=raw, block_length=block_length)
        super(TimeseriesTransformerDataset, self).__init__(
            timeseries, transformer, cpu_only, space_preserving)
    else:
        raw = DenseDesignMatrix(X=raw)
        super(TimeseriesTransformerDataset, self).__init__(
            raw, transformer, cpu_only, space_preserving)
def load_data(start, stop):
    # Loads the 1 million images into X and creates a DenseDesignMatrix
    # for use in a Denoising Autoencoder which is later used in a sDAE.
    # Returns: dataset: DenseDesignMatrix(start, stop)
    dataset_location = os.path.expanduser(
        "~/catkin_ws/src/athomesoftware/datasets/tinyimages/")
    #dataset_location = "~/catkin_ws/src/athomesoftware/datasets/cifar10/"
    X = []
    print("Loading images from " + dataset_location)
    # os.walk yields (dirpath, dirnames, filenames) tuples, so iterate
    # over the file names and keep the .png images
    for dirpath, dirnames, filenames in os.walk(dataset_location):
        for filename in filenames:
            if filename.endswith('.png'):
                im = Image.open(os.path.join(dirpath, filename))
                # linearize each image into a single design-matrix row
                X.append(np.asarray(im, dtype=np.float32).flatten())
    print("Images loaded from " + dataset_location)
    X = np.asarray(X)
    X = X[start:stop, :]
    print("Creating design matrix " + dataset_location)
    # the images are unlabeled, so no y is passed
    return DenseDesignMatrix(X=X)
def test_execution_order():
    # ensure save is called directly after monitoring by checking
    # parameter values in `on_monitor` and `on_save`.

    model = MLP(layers=[Softmax(layer_name='y', n_classes=2, irange=0.)],
                nvis=3)

    dataset = DenseDesignMatrix(X=np.random.normal(size=(6, 3)),
                                y=np.random.normal(size=(6, 2)))

    epoch_counter = EpochCounter(max_epochs=1)

    algorithm = SGD(batch_size=2, learning_rate=0.1,
                    termination_criterion=epoch_counter)

    extension = ParamMonitor()

    train = Train(dataset=dataset,
                  model=model,
                  algorithm=algorithm,
                  extensions=[extension],
                  save_freq=1,
                  save_path="save.pkl")

    # mock save
    train.save = MethodType(only_run_extensions, train)

    train.main_loop()
def setup(self):
    """
    We use a small predefined 8x5 matrix for
    which we know the ZCA transform.
    """
    self.X = np.array([[-10.0, 3.0, 19.0, 9.0, -15.0],
                       [7.0, 26.0, 26.0, 26.0, -3.0],
                       [17.0, -17.0, -37.0, -36.0, -11.0],
                       [19.0, 15.0, -2.0, 5.0, 9.0],
                       [-3.0, -8.0, -35.0, -25.0, -8.0],
                       [-18.0, 3.0, 4.0, 15.0, 14.0],
                       [5.0, -4.0, -5.0, -7.0, -11.0],
                       [23.0, 22.0, 15.0, 20.0, 12.0]])
    self.dataset = DenseDesignMatrix(X=as_floatX(self.X),
                                     y=as_floatX(np.ones((8, 1))))
    self.num_components = self.dataset.get_design_matrix().shape[1] - 1
def load(start, stop, datadir='data/CK'):
    im_list = glob.glob(os.path.join(datadir, 'faces_aligned/*.png'))[start:]
    if not im_list:
        msg = ('No image files found in: %s'
               % os.path.realpath(os.path.join(datadir, 'faces_aligned')))
        log.error(msg)
        raise RuntimeError(msg)
    X = []
    y = []
    more_to_read = stop - start
    for im_file in im_list:
        if more_to_read <= 0:
            break
        label_base_pat = os.path.basename(im_file)[:9] + '*_emotion.txt'
        maybe_label_file = glob.glob(
            os.path.join(datadir, 'labels', label_base_pat))
        if maybe_label_file:
            y.append(read_label(maybe_label_file[0]))
            imdata = imread(im_file, False)
            imdata = cv2.resize(imdata, (32, 32))
            imdata = imdata.flatten().astype(np.float32) / 255
            X.append(imdata)
            more_to_read -= 1
    return DenseDesignMatrix(X=np.asarray(X),
                             y=np.asarray(y).reshape(-1, 1),
                             view_converter=DefaultViewConverter(
                                 (32, 32, 1), axes=('b', 0, 1, 'c')))
def random_dense_design_matrix(rng, num_examples, dim, num_classes):
    """
    Creates a random dense design matrix that has class labels.

    Parameters
    ----------
    rng : numpy.random.RandomState
        The random number generator used to generate the dataset.
    num_examples : int
        The number of examples to create.
    dim : int
        The number of features in each example.
    num_classes : int
        The number of classes to assign the examples to.
        0 indicates that no class labels will be generated.
    """
    X = rng.randn(num_examples, dim)

    if num_classes:
        Y = rng.randint(0, num_classes, (num_examples, 1))
        y_labels = num_classes
    else:
        Y = None
        y_labels = None

    return DenseDesignMatrix(X=X, y=Y, y_labels=y_labels)
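# Example call for the factory above; the shapes follow directly from
# the arguments, and the labels are integers drawn from [0, num_classes).
rng = np.random.RandomState(0)
ds = random_dense_design_matrix(rng, num_examples=20, dim=5, num_classes=4)
assert ds.X.shape == (20, 5)
assert ds.y.shape == (20, 1)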
def test_serialization_guard():
    # tests that Train refuses to serialize the dataset

    dim = 2
    m = 11

    rng = np.random.RandomState([28, 9, 2012])
    X = rng.randn(m, dim)
    dataset = DenseDesignMatrix(X=X)

    model = DummyModel(dim)
    # make the dataset part of the model, so it will get
    # serialized
    model.dataset = dataset

    Monitor.get_monitor(model)

    algorithm = DummyAlgorithm()

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path='_tmp_unit_test.pkl',
                  save_freq=1,
                  callbacks=None)

    try:
        train.main_loop()
    except RuntimeError:
        return
    assert False  # train did not complain, this is a bug
def monary_load(start=0, stop=-1, find_args=None, species_to_retrieve=[]):
    # avoid mutating a shared default dict across calls
    if find_args is None:
        find_args = {}
    if species_to_retrieve == []:
        species_to_retrieve = species
    else:
        species_to_retrieve = [s for s in species_to_retrieve
                               if s in species]

    query = {}
    for s in species_to_retrieve:
        query[s] = {"$gt": 0}
    find_args["$or"] = [{k: query[k]} for k in query.keys()]

    with Monary("127.0.0.1") as monary:
        out = monary.query(
            "creeval",
            collection,
            find_args,
            num_metadata + cat_metadata + species_to_retrieve,
            ["float32"] * (len(num_metadata) + len(cat_metadata) +
                           len(species_to_retrieve)),
            limit=(stop - start),
            offset=start)

    for i, col in enumerate(out[0:len(num_metadata + cat_metadata)]):
        out[i] = np.ma.filled(col, np.ma.mean(col))
        #if any(np.isnan(col)):
        #    print col

    out = np.ma.row_stack(out).T
    X = out[:, 0:len(num_metadata + cat_metadata)]
    y = out[:, len(num_metadata + cat_metadata):]
    y = (y > 0).astype(int)

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    pickle.dump(scaler, open(collection + "_scaler.pkl", "wb"))

    y = np.asarray(y)

    return DenseDesignMatrix(X=X, y=y)
def array_to_ds(X):
    """
    Build a DenseDesignMatrix with topo_view using X.

    X: an nsamples x pixels numpy array, or a list of linearized images
    """
    if type(X) is list:
        X = np.asarray(X)
    return DenseDesignMatrix(topo_view=X.reshape(X.shape + (1,)))
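# Sketch of array_to_ds on a stack of ten 8x8 single-channel images. This
# input layout is an assumption for illustration: the reshape appends a
# channel axis, so a (10, 8, 8) input becomes a (10, 8, 8, 1) topological
# view in the ('b', 0, 1, 'c') convention.
imgs = np.random.rand(10, 8, 8)
ds = array_to_ds(imgs)
assert ds.get_topological_view().shape == (10, 8, 8, 1)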
def test_zero_image(self):
    """
    Test that a zero-valued image does not cause any division by zero
    """
    X = as_floatX(np.zeros((5, 32 * 32 * 3)))

    axes = ['b', 0, 1, 'c']
    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)
    dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
    dataset.axes = axes
    preprocessor = LeCunLCN(img_shape=[32, 32])
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert isfinite(result)
def test_zero_vector(self):
    """
    Test that passing in the zero vector does not result in
    a divide by 0
    """
    dataset = DenseDesignMatrix(X=as_floatX(np.zeros((1, 1))))

    # the settings of subtract_mean and use_std are not relevant to
    # the test
    # sqrt_bias = 0.0 is the only value for which there should be a
    # risk of failure occurring
    preprocessor = GlobalContrastNormalization(subtract_mean=True,
                                               sqrt_bias=0.0,
                                               use_std=True)
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert not np.any(np.isnan(result))
    assert not np.any(np.isinf(result))
def test_channel(self):
    """
    Test that the preprocessor works when only a subset of the
    channels is passed as an argument
    """
    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(5, 32 * 32 * 3))

    axes = ['b', 0, 1, 'c']
    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)
    dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
    dataset.axes = axes
    preprocessor = LeCunLCN(img_shape=[32, 32], channels=[1, 2])
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert isfinite(result)
def test_finitedataset_source_check():
    """
    Check that the FiniteDatasetIterator returns sensible
    errors when there is a missing source in the dataset.
    """
    dataset = DenseDesignMatrix(
        X=np.random.rand(20, 15).astype(theano.config.floatX),
        y=np.random.rand(20, 5).astype(theano.config.floatX))
    assert_raises(ValueError,
                  dataset.iterator,
                  mode='sequential',
                  batch_size=5,
                  data_specs=(VectorSpace(15), 'featuresX'))
    try:
        dataset.iterator(mode='sequential',
                         batch_size=5,
                         data_specs=(VectorSpace(15), 'featuresX'))
    except ValueError as e:
        assert 'featuresX' in str(e)
def test_rgb_yuv():
    """
    Test on a random image that the preprocessor loads and runs
    without error and does not produce any nan or inf values
    """
    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(5, 32 * 32 * 3))

    axes = ['b', 0, 1, 'c']
    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)
    dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
    dataset.axes = axes
    preprocessor = RGB_YUV()
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert isfinite(result)
def test_random_image(self):
    """
    Test on a random image that the preprocessor loads and runs
    without error and does not produce any nan or inf values
    """
    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(5, 32 * 32 * 3))

    axes = ["b", 0, 1, "c"]
    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)
    dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
    dataset.axes = axes
    preprocessor = LeCunLCN(img_shape=[32, 32])
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert not np.any(np.isnan(result))
    assert not np.any(np.isinf(result))
def test_split_nfold_datasets():
    # Load and create ddm from cifar100
    path = "/data/lisa/data/cifar100/cifar-100-python/train"
    obj = serial.load(path)
    X = obj['data']

    assert X.max() == 255.
    assert X.min() == 0.

    X = np.cast['float32'](X)
    y = None  # not implemented yet

    view_converter = DefaultViewConverter((32, 32, 3))

    ddm = DenseDesignMatrix(X=X, y=y, view_converter=view_converter)

    assert not np.any(np.isnan(ddm.X))

    ddm.y_fine = np.asarray(obj['fine_labels'])
    ddm.y_coarse = np.asarray(obj['coarse_labels'])

    folds = ddm.split_dataset_nfolds(10)
    print folds[0].shape
def test_extract_reassemble():
    """
    Tests that ExtractGridPatches and ReassembleGridPatches are
    inverse of each other
    """
    rng = np.random.RandomState([1, 3, 7])

    topo = rng.randn(4, 3 * 5, 3 * 7, 2)

    dataset = DenseDesignMatrix(topo_view=topo)

    patch_shape = (3, 7)

    extractor = ExtractGridPatches(patch_shape, patch_shape)
    reassemblor = ReassembleGridPatches(patch_shape=patch_shape,
                                        orig_shape=topo.shape[1:3])

    dataset.apply_preprocessor(extractor)
    dataset.apply_preprocessor(reassemblor)

    new_topo = dataset.get_topological_view()

    assert new_topo.shape == topo.shape
    assert np.all(new_topo == topo)
def test_split_datasets():
    # Load and create ddm from cifar100
    path = "/data/lisa/data/cifar100/cifar-100-python/train"
    obj = serial.load(path)
    X = obj['data']

    assert X.max() == 255.
    assert X.min() == 0.

    X = np.cast['float32'](X)
    y = None  # not implemented yet

    view_converter = DefaultViewConverter((32, 32, 3))

    ddm = DenseDesignMatrix(X=X, y=y, view_converter=view_converter)

    assert not np.any(np.isnan(ddm.X))

    ddm.y_fine = np.asarray(obj['fine_labels'])
    ddm.y_coarse = np.asarray(obj['coarse_labels'])

    (train, valid) = ddm.split_dataset_holdout(train_prop=0.5)
    assert valid.shape[0] == np.ceil(ddm.num_examples * 0.5)
    assert train.shape[0] == (ddm.num_examples - valid.shape[0])
def test_init_with_X_or_topo():
    # tests that constructing with topo_view works
    # tests that construction with design matrix works
    # tests that conversion from topo_view to design matrix and back works
    # tests that conversion the other way works too

    rng = np.random.RandomState([1, 2, 3])
    topo_view = rng.randn(5, 2, 2, 3)
    d1 = DenseDesignMatrix(topo_view=topo_view)
    X = d1.get_design_matrix()
    d2 = DenseDesignMatrix(X=X, view_converter=d1.view_converter)
    topo_view_2 = d2.get_topological_view()
    assert np.allclose(topo_view, topo_view_2)

    X = rng.randn(*X.shape)
    topo_view_3 = d2.get_topological_view(X)
    X2 = d2.get_design_matrix(topo_view_3)
    assert np.allclose(X, X2)
def __init__(self, patient_id, which_set, list_features,
             leave_out_seizure_idx_valid, leave_out_seizure_idx_test,
             data_dir, preictal_sec, use_all_nonictals,
             preprocessor_dir, n_selected_features=-1, batch_size=None,
             balance_class=True, axes=('b', 0, 1, 'c'), default_seed=0):
    self.balance_class = balance_class
    self.batch_size = batch_size

    tmp_list_features = np.empty(len(list_features), dtype=object)
    for f_idx in range(len(list_features)):
        tmp_list_features[f_idx] = FeatureList.get_info(list_features[f_idx])
    list_features = tmp_list_features

    print 'List of features:'
    for f in list_features:
        print f['feature'] + '.' + f['param']
    print ''

    EpilepsiaeFeatureLoader.__init__(
        self,
        patient_id=patient_id,
        which_set=which_set,
        list_features=list_features,
        leave_out_seizure_idx_valid=leave_out_seizure_idx_valid,
        leave_out_seizure_idx_test=leave_out_seizure_idx_test,
        data_dir=data_dir,
        preictal_sec=preictal_sec,
        use_all_nonictals=use_all_nonictals)
    # Row: samples, Col: features
    raw_X, y = self.load_data()

    if n_selected_features != -1:
        all_rank_df = None
        for f_idx, feature in enumerate(self.list_features):
            rank_df = pd.read_csv(os.path.join(
                data_dir,
                patient_id + '/rank_feature_idx_' + feature['param'] +
                '_' + 'leaveout_' + str(leave_out_seizure_idx_valid) +
                '_' + str(leave_out_seizure_idx_test) + '.txt'))
            if f_idx == 0:
                all_rank_df = rank_df
            else:
                offset_f_idx = 0
                for i in range(f_idx):
                    offset_f_idx = offset_f_idx + \
                        self.list_features[i]['n_features']
                rank_df['feature_idx'] = rank_df['feature_idx'].values + \
                    offset_f_idx
                all_rank_df = pd.concat([all_rank_df, rank_df])
        sorted_feature_df = all_rank_df.sort(['D_ADH'], ascending=[0])
        self.selected_feature_idx = \
            sorted_feature_df['feature_idx'][:n_selected_features]
        raw_X = raw_X[:, self.selected_feature_idx]
    else:
        self.selected_feature_idx = np.arange(raw_X.shape[1])

    # Print shape of input data
    print '------------------------------'
    print 'Dataset: {0}'.format(self.which_set)
    print 'Number of samples: {0}'.format(raw_X.shape[0])
    print ' Preictal samples: {0}'.format(self.preictal_samples)
    print ' Nonictal samples: {0}'.format(self.nonictal_samples)
    print ' NaN samples: {0}'.format(self.nan_non_flat_samples)
    print ' Note for \'train\' and \'valid_train\': number of samples ' \
          'will be equal without removing the nan samples.'
    print 'Number of features: {0}'.format(raw_X.shape[1])
    print '------------------------------'

    # Preprocessing
    scaler_path = os.path.join(
        preprocessor_dir,
        self.patient_id + '_scaler_feature_' +
        str(self.leave_out_seizure_idx_valid) + '_' +
        str(self.leave_out_seizure_idx_test) + '.pkl')
    if which_set == 'train':
        scaler = preprocessing.StandardScaler()
        # scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(raw_X)
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)
        preprocessed_X = scaler.transform(raw_X)
    else:
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        preprocessed_X = scaler.transform(raw_X)
    raw_X = None

    if self.which_set == 'train' or self.which_set == 'valid_train':
        # Shuffle the data
        print ''
        print '*** Shuffle data ***'
        print ''
        permute_idx = np.random.permutation(preprocessed_X.shape[0])
        preprocessed_X = preprocessed_X[permute_idx, :]
        y = y[permute_idx, :]

    if self.balance_class and (self.which_set == 'train' or
                               self.which_set == 'valid_train'):
        self.X_full = preprocessed_X
        self.y_full = y
        (X, y) = self.get_data()
    else:
        # Zero-padding (if necessary)
        if not (self.batch_size is None):
            preprocessed_X, y = self.zero_pad(preprocessed_X, y,
                                              self.batch_size)
        X = preprocessed_X

    # Initialize DenseDesignMatrix
    DenseDesignMatrix.__init__(self, X=X, y=y, axes=axes)
k = 3
X = np.zeros((m * k, patch_shape[0] * patch_shape[1] * 3), dtype='float32')

rng = np.random.RandomState([1, 2, 3])

for i, img_path in enumerate(ImageIterator(path, suffix=".npy")):
    img = np.load(img_path)
    if img.shape[2] == 1:
        img = np.concatenate((img, img, img), axis=2)
    img = img.reshape(1, img.shape[0], img.shape[1], img.shape[2])

    d = DenseDesignMatrix(topo_view=img,
                          view_converter=DefaultViewConverter(img.shape[1:]))

    random_rng = np.random.RandomState([rng.randint(0, 256),
                                        rng.randint(0, 256),
                                        rng.randint(0, 256)])

    p = ExtractPatches(patch_shape=patch_shape,
                       num_patches=k,
                       rng=random_rng)

    d.apply_preprocessor(p)

    # store the k patches extracted from this image
    X[i * k:(i + 1) * k, :] = d.X

d.X = X

base = '/data/lisatmp/goodfeli/darpa_imagenet_patch_%dx%d_train.' % \
    (patch_shape[0], patch_shape[1])
d.use_design_loc(base + 'npy')
serial.save(base + 'pkl', d)
if feature_type == 'exp_hs':
    feat = H * Mu1
elif feature_type == 'exp_h':
    feat = H
elif feature_type == 'map_hs':
    feat = (H > 0.5) * Mu1
else:
    assert False

print 'compiling theano function'
f = function([V], feat)

print 'running theano function'
feat = f(X2)

feat_dataset = DenseDesignMatrix(
    X=feat,
    view_converter=DefaultViewConverter([1, 1, feat.shape[1]]))

print 'reassembling features'
ns = 32 - size + 1
depatchifier = ReassembleGridPatches(orig_shape=(ns, ns),
                                     patch_shape=(1, 1))
feat_dataset.apply_preprocessor(depatchifier)

print 'making topological view'
topo_feat = feat_dataset.get_topological_view()
assert topo_feat.shape[0] == X.shape[0]

print 'assembling visualizer'
n = np.ceil(np.sqrt(model.nhid))
pv3 = PatchViewer(grid_shape=(X.shape[0], num_filters),
                  patch_shape=(ns, ns),
                  is_color=False)
class testZCA:

    def setup(self):
        """
        We use a small predefined 8x5 matrix for
        which we know the ZCA transform.
        """
        self.X = np.array([[-10.0, 3.0, 19.0, 9.0, -15.0],
                           [7.0, 26.0, 26.0, 26.0, -3.0],
                           [17.0, -17.0, -37.0, -36.0, -11.0],
                           [19.0, 15.0, -2.0, 5.0, 9.0],
                           [-3.0, -8.0, -35.0, -25.0, -8.0],
                           [-18.0, 3.0, 4.0, 15.0, 14.0],
                           [5.0, -4.0, -5.0, -7.0, -11.0],
                           [23.0, 22.0, 15.0, 20.0, 12.0]])
        self.dataset = DenseDesignMatrix(X=as_floatX(self.X),
                                         y=as_floatX(np.ones((8, 1))))
        self.num_components = self.dataset.get_design_matrix().shape[1] - 1

    def get_preprocessed_data(self, preprocessor):
        X = copy.copy(self.X)
        dataset = DenseDesignMatrix(X=X,
                                    preprocessor=preprocessor,
                                    fit_preprocessor=True)
        return dataset.get_design_matrix()

    def test_zca(self):
        """
        Confirm that ZCA.inv_P_ is the correct inverse of ZCA.P_.
        There's a lot else about the ZCA class that could be tested here.
        """
        preprocessor = ZCA()
        preprocessor.fit(self.X)

        identity = np.identity(self.X.shape[1], theano.config.floatX)

        # Check some basics of transformation matrix
        assert preprocessor.P_.shape == (self.X.shape[1], self.X.shape[1])
        assert_allclose(np.dot(preprocessor.P_, preprocessor.inv_P_),
                        identity, rtol=1e-4)

        preprocessor = ZCA(filter_bias=0.0)
        preprocessed_X = self.get_preprocessed_data(preprocessor)

        # Check if preprocessed data matrix is white
        assert_allclose(np.cov(preprocessed_X.transpose(), bias=1),
                        identity, rtol=1e-4)

        # Check if we obtain correct solution
        zca_transformed_X = np.array(
            [[-1.0199, -0.1832, 1.9528, -0.9603, -0.8162],
             [0.0729, 1.4142, 0.2529, 1.1861, -1.0876],
             [0.9575, -1.1173, -0.5435, -1.4372, -0.1057],
             [0.6348, 1.1258, 0.2692, -0.8893, 1.1669],
             [-0.9769, 0.8297, -1.8676, -0.6055, -0.5096],
             [-1.5700, -0.8389, -0.0931, 0.8877, 1.6089],
             [0.4993, -1.4219, -0.3443, 0.9664, -1.1022],
             [1.4022, 0.1917, 0.3736, 0.8520, 0.8456]])
        assert_allclose(preprocessed_X, zca_transformed_X, rtol=1e-3)

    def test_num_components(self):
        # Keep 3 components
        preprocessor = ZCA(filter_bias=0.0, n_components=3)
        preprocessed_X = self.get_preprocessed_data(preprocessor)
        zca_truncated_X = np.array(
            [[-0.8938, -0.3084, 1.1105, 0.1587, -1.4073],
             [0.3346, 0.5193, 1.1371, 0.6545, -0.4199],
             [0.7613, -0.4823, -1.0578, -1.1997, -0.4993],
             [0.9250, 0.5012, -0.2743, 0.1735, 0.8105],
             [-0.4928, -0.6319, -1.0359, -0.7173, 0.1469],
             [-1.8060, -0.1758, -0.2943, 0.7208, 1.4359],
             [0.0079, -0.2582, 0.1368, -0.3571, -0.8147],
             [1.1636, 0.8362, 0.2777, 0.5666, 0.7480]])
        assert_allclose(zca_truncated_X, preprocessed_X, rtol=1e-3)

        # Drop 2 components: result should be similar
        preprocessor = ZCA(filter_bias=0.0, n_drop_components=2)
        preprocessed_X = self.get_preprocessed_data(preprocessor)
        assert_allclose(zca_truncated_X, preprocessed_X, rtol=1e-3)

    def test_zca_inverse(self):
        """
        Calculates the inverse of X with numpy.linalg.inv
        if inv_P_ is not stored.
        """
        def test(store_inverse):
            preprocessed_X = copy.copy(self.X)
            preprocessor = ZCA(store_inverse=store_inverse)

            dataset = DenseDesignMatrix(X=preprocessed_X,
                                        preprocessor=preprocessor,
                                        fit_preprocessor=True)

            preprocessed_X = dataset.get_design_matrix()
            assert_allclose(self.X, preprocessor.inverse(preprocessed_X))

        test(store_inverse=True)
        test(store_inverse=False)

    def test_zca_dtypes(self):
        """
        Confirm that ZCA.fit works regardless of dtype of
        data and config.floatX
        """
        orig_floatX = config.floatX
        try:
            for floatX in ['float32', 'float64']:
                config.floatX = floatX
                for dtype in ['float32', 'float64']:
                    preprocessor = ZCA()
                    preprocessor.fit(self.X.astype(dtype))
        finally:
            config.floatX = orig_floatX