Example #1
 def __init__(self,
              filename,
              X=None,
              topo_view=None,
              y=None,
              load_all=False,
              **kwargs):
     if 'preprocessor' in kwargs:
         if ('fit_preprocessor' in kwargs and
                 kwargs['fit_preprocessor'] is False) or ('fit_preprocessor'
                                                          not in kwargs):
             self._preprocessor = kwargs['preprocessor']
             kwargs['preprocessor'] = None
     else:
         self._preprocessor = None
     self.load_all = load_all
     if h5py is None:
         raise RuntimeError("Could not import h5py.")
     self._file = h5py.File(filename)
     if X is not None:
         X = self.get_dataset(X, load_all)
     if topo_view is not None:
         topo_view = self.get_dataset(topo_view, load_all)
     if y is not None:
         y = self.get_dataset(y, load_all)
     DenseDesignMatrix.__init__(self,
                                X=X,
                                topo_view=topo_view,
                                y=y,
                                **kwargs)
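For orientation, here is a minimal sketch of building an HDF5 file that this constructor could then open. The file name, the dataset names 'X' and 'y', and the class name HDF5Dataset are illustrative assumptions, not part of the example above.

import h5py
import numpy as np

# Write a toy design matrix (100 examples, 10 features) and integer
# labels into an HDF5 file under the names we will pass to the loader.
with h5py.File('toy.h5', 'w') as f:
    f.create_dataset('X', data=np.random.randn(100, 10))
    f.create_dataset('y', data=np.random.randint(0, 2, size=(100, 1)))

# Assuming the constructor above belongs to a class named HDF5Dataset:
# ds = HDF5Dataset('toy.h5', X='X', y='y', load_all=True)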
Example #2
def test_multiple_monitoring_datasets():
    # tests that DefaultTrainingAlgorithm can take multiple
    # monitoring datasets.

    BATCH_SIZE = 2
    BATCHES = 3
    NUM_FEATURES = 4
    dim = 3
    m = 10

    rng = np.random.RandomState([2014, 2, 25])
    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    train = DenseDesignMatrix(X=X)
    test = DenseDesignMatrix(X=Y)

    algorithm = DefaultTrainingAlgorithm(
        batch_size=BATCH_SIZE,
        batches_per_iter=BATCHES,
        monitoring_dataset={'train': train, 'test': test})

    model = S3C(nvis=NUM_FEATURES, nhid=1,
                irange=.01, init_bias_hid=0., init_B=1.,
                min_B=1., max_B=1., init_alpha=1.,
                min_alpha=1., max_alpha=1., init_mu=0.,
                m_step=Grad_M_Step(learning_rate=0.),
                e_step=E_Step(h_new_coeff_schedule=[1.]))

    algorithm.setup(model=model, dataset=train)
    algorithm.train(dataset=train)
Example #3
def test_convert_to_one_hot():
    rng = np.random.RandomState([2013, 11, 14])
    m = 11
    d = DenseDesignMatrix(
        X=rng.randn(m, 4),
        y=rng.randint(low=0, high=10, size=(m,)))
    d.convert_to_one_hot()
Example #4
def get_feats_from_cnn(rows, model=None):
    """
    fprop rows using best trained model and returns activations of the
    penultimate layer
    """
    conf = utils.get_config()
    patch_size = conf['patch_size']
    region_size = conf['region_size']
    batch_size = None
    preds = utils.get_predictor(model=model, return_all=True)
    y = np.zeros(len(rows))
    samples = np.zeros(
        (len(rows), region_size, region_size, 1), dtype=np.float32)
    for i, row in enumerate(rows):
        print 'processing %i-th image: %s' % (i, row['image_filename'])
        try:
            samples[i] = utils.get_samples_from_image(row, False)[0]
        except ValueError as e:
            print '{1} Value error: {0}'.format(str(e), row['image_filename'])
        y[i] = utils.is_positive(row)
    ds = DenseDesignMatrix(topo_view=samples)
    pipeline = utils.get_pipeline(
        ds.X_topo_space.shape, patch_size, batch_size)
    pipeline.apply(ds)
    return preds[-2](ds.get_topological_view()), y
Example #5
def get_feats_from_cnn(rows, model=None):
    """
    fprop rows using best trained model and returns activations of the
    penultimate layer
    """
    conf = utils.get_config()
    patch_size = conf['patch_size']
    region_size = conf['region_size']
    batch_size = None
    preds = utils.get_predictor(model=model, return_all=True)
    y = np.zeros(len(rows))
    samples = np.zeros((len(rows), region_size, region_size, 1),
                       dtype=np.float32)
    for i, row in enumerate(rows):
        print 'processing %i-th image: %s' % (i, row['image_filename'])
        try:
            samples[i] = utils.get_samples_from_image(row, False)[0]
        except ValueError as e:
            print '{1} Value error: {0}'.format(str(e), row['image_filename'])
        y[i] = utils.is_positive(row)
    ds = DenseDesignMatrix(topo_view=samples)
    pipeline = utils.get_pipeline(ds.X_topo_space.shape, patch_size,
                                  batch_size)
    pipeline.apply(ds)
    return preds[-2](ds.get_topological_view()), y
Example #6
    def test_unit_norm(self):
        """ Test that using std_bias = 0.0 and use_norm = True
            results in vectors having unit norm """

        tol = 1e-5

        num_examples = 5
        num_features = 10

        rng = np.random.RandomState([1, 2, 3])

        X = as_floatX(rng.randn(num_examples, num_features))

        dataset = DenseDesignMatrix(X=X)

        # the setting of subtract_mean is not relevant to the test
        # the test only applies when sqrt_bias = 0.0 and use_std = False
        preprocessor = GlobalContrastNormalization(subtract_mean=False,
                                                   sqrt_bias=0.0,
                                                   use_std=False)

        dataset.apply_preprocessor(preprocessor)

        result = dataset.get_design_matrix()

        norms = np.sqrt(np.square(result).sum(axis=1))

        max_norm_error = np.abs(norms - 1.).max()

        tol = 3e-5

        assert max_norm_error < tol
Example #7
def test_from_dataset():
    """
    Tests whether it supports integer labels.
    """
    rng = np.random.RandomState([1, 2, 3])
    topo_view = rng.randn(12, 2, 3, 3)
    y = rng.randint(0, 5, (12, 1))

    # without y:
    d1 = DenseDesignMatrix(topo_view=topo_view)
    slice_d = from_dataset(d1, 5)
    assert slice_d.X.shape[1] == d1.X.shape[1]
    assert slice_d.X.shape[0] == 5

    # with y:
    d2 = DenseDesignMatrix(topo_view=topo_view, y=y)
    slice_d = from_dataset(d2, 5)
    assert slice_d.X.shape[1] == d2.X.shape[1]
    assert slice_d.X.shape[0] == 5
    assert slice_d.y.shape[0] == 5

    # without topo_view:
    x = topo_view.reshape(12, 18)
    d3 = DenseDesignMatrix(X=x, y=y)
    slice_d = from_dataset(d3, 5)
    assert slice_d.X.shape[1] == d3.X.shape[1]
    assert slice_d.X.shape[0] == 5
    assert slice_d.y.shape[0] == 5
Example #8
 def get_matrices(self, n=3, ratios=[.7, .15, .15]):
     X = []
     y = []
     for ngr, w in self.iterate_ngram_training(n):
         X.append(ngr)
         y.append([w])
     X = numpy.array(X)
     y = numpy.array(y)
     total = len(y)
      training = int(round(total * ratios[0]))
      valid = training + int(round(total * ratios[1]))
     #test = total - training - valid
     labels = len(self.vocab)
     training_data = DenseDesignMatrix(X=X[:training, :],
                                       y=y[:training],
                                       X_labels=labels,
                                       y_labels=labels)
     valid_data = DenseDesignMatrix(X=X[training:valid, :],
                                    y=y[training:valid],
                                    X_labels=labels,
                                    y_labels=labels)
     test_data = DenseDesignMatrix(X=X[valid:, :],
                                   y=y[valid:],
                                   X_labels=labels,
                                   y_labels=labels)
     return training_data, valid_data, test_data
Example #11
 def create_batch_matrices(self, ratios=[.7, .15, .15]):
     res = self.read_batch()
     if res is None:
         return None
     X, y = res
     num_labels = len(self.needed) + 1  # for filtered words
     X = numpy.array(X)
     y = numpy.array(y)
     total = len(y)
     indices = range(total)
     shuffle(indices)
     training = int(round(total * ratios[0]))
     valid = int(round(total * ratios[1]))
      training_indices = indices[:training]
      valid_indices = indices[training:training + valid]
      test_indices = indices[training + valid:]
     training_data = DenseDesignMatrix(X=X[training_indices, :],
                                       y=y[training_indices],
                                       X_labels=num_labels,
                                       y_labels=num_labels)
     valid_data = DenseDesignMatrix(X=X[valid_indices, :],
                                    y=y[valid_indices],
                                    X_labels=num_labels,
                                    y_labels=num_labels)
      test_data = DenseDesignMatrix(X=X[test_indices, :],
                                    y=y[test_indices],
                                    X_labels=num_labels,
                                    y_labels=num_labels)
     return training_data, valid_data, test_data
Example #12
def test_sgd_sup():

    # tests that we can run the sgd algorithm
    # on a supervised cost.
    # does not test for correctness at all, just
    # that the algorithm runs without dying

    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m, ))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    dataset = DenseDesignMatrix(X=X, y=Y)

    m = 15
    X = rng.randn(m, dim)

    idx = rng.randint(0, dim, (m,))
    Y = np.zeros((m, dim))
    for i in xrange(m):
        Y[i, idx[i]] = 1

    # Including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_dataset = DenseDesignMatrix(X=X, y=Y)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    cost = SupervisedDummyCost()

    # We need to include this so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=3,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=None)

    train.main_loop()
Example #13
    def next(self):
        next_index = self._subset_iterator.next()

        # convert to boolean selection
        sel = np.zeros(self.num_examples, dtype=bool)
        sel[next_index] = True
        next_index = sel

        rval = []
        for data, fn in safe_izip(self._raw_data, self._convert):
            try:
                this_data = data[next_index]
            except TypeError:
                this_data = data[next_index, :]
            if fn:
                this_data = fn(this_data)
            if self._preprocessor is not None:
                d = DenseDesignMatrix(X=this_data)
                self._preprocessor.apply(d)
                this_data = d.get_design_matrix()
            assert not np.any(np.isnan(this_data))
            rval.append(this_data)
        rval = tuple(rval)
        if not self._return_tuple and len(rval) == 1:
            rval, = rval
        return rval
Example #14
def apply_ZCA_fast(patches, normalize, zca_preprocessor):
    patches = patches.astype(np.float32)
    if normalize:
        patches /= 255.0
    dataset = DenseDesignMatrix(X=patches.T)
    zca_preprocessor.apply(dataset)
    patches = dataset.get_design_matrix()
    return patches.T
Example #15
def test_init_bias_target_marginals():
    """
    Test `Softmax` layer instantiation with `init_bias_target_marginals`.
    """
    batch_size = 5
    n_features = 5
    n_classes = 3
    n_targets = 3
    irange = 0.1
    learning_rate = 0.1

    X_data = np.random.random(size=(batch_size, n_features))

    Y_categorical = np.asarray([[0], [1], [1], [2], [2]])
    class_frequencies = np.asarray([.2, .4, .4])
    categorical_dataset = DenseDesignMatrix(X_data,
                                            y=Y_categorical,
                                            y_labels=n_classes)

    Y_continuous = np.random.random(size=(batch_size, n_targets))
    Y_means = np.mean(Y_continuous, axis=0)
    continuous_dataset = DenseDesignMatrix(X_data,
                                           y=Y_continuous)

    Y_multiclass = np.random.randint(n_classes,
                                     size=(batch_size, n_targets))
    multiclass_dataset = DenseDesignMatrix(X_data,
                                           y=Y_multiclass,
                                           y_labels=n_classes)

    def softmax_layer(dataset):
        return Softmax(n_classes, 'h0', irange=irange,
                       init_bias_target_marginals=dataset)

    valid_categorical_mlp = MLP(
        layers=[softmax_layer(categorical_dataset)],
        nvis=n_features
    )

    actual = valid_categorical_mlp.layers[0].b.get_value()
    expected = pseudoinverse_softmax_numpy(class_frequencies)
    assert np.allclose(actual, expected)

    valid_continuous_mlp = MLP(
        layers=[softmax_layer(continuous_dataset)],
        nvis=n_features
    )

    actual = valid_continuous_mlp.layers[0].b.get_value()
    expected = pseudoinverse_softmax_numpy(Y_means)
    assert np.allclose(actual, expected)

    def invalid_multiclass_mlp():
        return MLP(
            layers=[softmax_layer(multiclass_dataset)],
            nvis=n_features
        )
    assert_raises(AssertionError, invalid_multiclass_mlp)
Example #16
def test_bgd_unsup():

    # tests that we can run the bgd algorithm
    # on an unsupervised cost.
    # does not test for correctness at all, just
    # that the algorithm runs without dying

    dim = 3
    m = 10

    rng = np.random.RandomState([25, 9, 2012])

    X = rng.randn(m, dim)

    dataset = DenseDesignMatrix(X=X)

    m = 15
    X = rng.randn(m, dim)

    # including a monitoring dataset lets us test that
    # the monitor works with unsupervised data
    monitoring_dataset = DenseDesignMatrix(X=X)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    batch_size = 5

    class DummyCost(Cost):
        def expr(self, model, data):
            self.get_data_specs(model)[0].validate(data)
            X = data
            return T.square(model(X) - X).mean()

        def get_data_specs(self, model):
            return (model.get_input_space(), model.get_input_source())

    cost = DummyCost()

    # We need to include this so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    algorithm = BGD(cost,
                    batch_size=5,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=None)

    train.main_loop()
Example #17
    def make_dataset(num_batches):
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = rng.randn(m, num_features)

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        return rval
Example #18
    def make_dataset(num_batches):
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = rng.randn(m, num_features)

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        return rval
Example #19
        def test(store_inverse):
            preprocessed_X = copy.copy(self.X)
            preprocessor = ZCA(store_inverse=store_inverse)

            dataset = DenseDesignMatrix(X=preprocessed_X,
                                        preprocessor=preprocessor,
                                        fit_preprocessor=True)

            preprocessed_X = dataset.get_design_matrix()
            assert_allclose(self.X, preprocessor.inverse(preprocessed_X))
Example #20
def testing_multiple_datasets_with_specified_dataset_in_monitor_based_lr():
    # tests that the class MonitorBasedLRAdjuster in sgd.py can properly use
    # the specified dataset_name in the constructor when multiple datasets
    # exist.

    dim = 3
    m = 10

    rng = np.random.RandomState([6, 2, 2014])

    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at some point
    epoch_num = 1

    # including monitoring datasets lets us test that
    # the monitor works with multiple datasets
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    monitoring_dataset = {'train': monitoring_train, 'test': monitoring_test}

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    dataset_name = monitoring_dataset.keys()[0]
    monitor_lr = MonitorBasedLRAdjuster(dataset_name=dataset_name)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=[monitor_lr])

    train.main_loop()
Example #21
class testPCA:
    """
    Tests for PCA preprocessor
    """
    def setup(self):
        rng = np.random.RandomState([1, 2, 3])
        self.dataset = DenseDesignMatrix(X=as_floatX(rng.randn(15, 10)),
                                         y=as_floatX(rng.randn(15, 1)))
        self.num_components = self.dataset.get_design_matrix().shape[1] - 1

    def test_apply_no_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        principal components are arranged in decreasing order by variance
        """
        # sut is an abbreviation for System Under Test
        sut = PCA(self.num_components)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(
            cm * (np.ones(cm.shape[0]) - np.eye(cm.shape[0])),
            np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are in decreasing order
        assert (np.diag(cm)[:-1] > np.diag(cm)[1:]).all()

    def test_apply_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        variance is the same along all principal components and equal to one
         """
        sut = PCA(self.num_components, whiten=True)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(
            cm * (np.ones(cm.shape[0]) - np.eye(cm.shape[0])),
            np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are all ones
        np.testing.assert_almost_equal(np.diag(cm), np.ones(cm.shape[0]))

    def test_apply_reduce_num_components(self):
        """
        Checks whether PCA performs dimensionality reduction
        """
        sut = PCA(self.num_components - 1, whiten=True)
        sut.apply(self.dataset, True)

        assert self.dataset.get_design_matrix().shape[1] ==\
            self.num_components - 1
Example #22
 def __init__(self, which_set, data_path=None, 
              term_range=None, target_type='cluster100'):
     """
     which_set: a string specifying which portion of the dataset
         to load. Valid values are 'train', 'valid' or 'test'
     data_path: a string specifying the directory containing the 
         webcluster data. If None (default), use environment 
         variable WEBCLUSTER_DATA_PATH.
      term_range: a tuple for taking only a slice of the available
          terms. Default is to use all 6275. For example, an input
          range of (10,2000) will truncate the 10 most frequent terms
          and the 6275-2000=4275 least frequent terms, where by frequency
          we mean how many unique documents each term is in.
     target_type: the type of targets to use. Valid options are 
         'cluster[10,100,1000]'
     """
     self.__dict__.update(locals())
     del self.self
     
     self.corpus_terms = None
     self.doc_info = None
     
     print "loading WebCluster DDM. which_set =", self.which_set
     
     if self.data_path is None:
         self.data_path \
             = string_utils.preprocess('${WEBCLUSTER_DATA_PATH}')
     
     fname = os.path.join(self.data_path, which_set+'_doc_inputs.npy')
     X = np.load(fname)
     if self.term_range is not None:
          X = X[:, self.term_range[0]:self.term_range[1]]
          X = X / X.sum(1).reshape(X.shape[0], 1)
     print X.sum(1).mean()
     
     fname = os.path.join(self.data_path, which_set+'_doc_targets.npy')
     # columns: 0:cluster10s, 1:cluster100s, 2:cluster1000s
     self.cluster_hierarchy = np.load(fname)
     
     y = None
     if self.target_type == 'cluster10':
         y = self.cluster_hierarchy[:,0]
     elif self.target_type == 'cluster100':
         y = self.cluster_hierarchy[:,1]
     elif self.target_type == 'cluster1000':
         y = self.cluster_hierarchy[:,2]
     elif self.target_type is None:
         pass
     else:
         raise NotImplementedError()
     
     DenseDesignMatrix.__init__(self, X=X, y=y)
     
     print "... WebCluster ddm loaded"
Example #23
class testPCA:
    """
    Tests for PCA preprocessor
    """
    def setup(self):
        rng = np.random.RandomState([1, 2, 3])
        self.dataset = DenseDesignMatrix(X=as_floatX(rng.randn(15, 10)),
                                         y=as_floatX(rng.randn(15, 1)))
        self.num_components = self.dataset.get_design_matrix().shape[1] - 1

    def test_apply_no_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        principal components are arranged in decreasing order by variance
        """
        # sut is an abbreviation for System Under Test
        sut = PCA(self.num_components)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(cm * (np.ones(cm.shape[0]) -
                                       np.eye(cm.shape[0])),
                                       np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are in decreasing order
        assert (np.diag(cm)[:-1] > np.diag(cm)[1:]).all()

    def test_apply_whiten(self):
        """
        Confirms that PCA has decorrelated the input dataset and
        variance is the same along all principal components and equal to one
         """
        sut = PCA(self.num_components, whiten=True)
        sut.apply(self.dataset, True)
        cm = np.cov(self.dataset.get_design_matrix().T)  # covariance matrix

        # testing whether the covariance matrix is a diagonal one
        np.testing.assert_almost_equal(cm * (np.ones(cm.shape[0]) -
                                       np.eye(cm.shape[0])),
                                       np.zeros((cm.shape[0], cm.shape[0])))

        # testing whether the eigenvalues are all ones
        np.testing.assert_almost_equal(np.diag(cm), np.ones(cm.shape[0]))

    def test_apply_reduce_num_components(self):
        """
        Checks whether PCA performs dimensionality reduction
        """
        sut = PCA(self.num_components - 1, whiten=True)
        sut.apply(self.dataset, True)

        assert self.dataset.get_design_matrix().shape[1] ==\
            self.num_components - 1
Example #24
def create_dense_design_matrix(x, y=None, num_classes=None):
    if y is None:
        return DenseDesignMatrix(X=x)

    if num_classes is None:
        return DenseDesignMatrix(X=x, y=y)

    y = y.reshape((-1, ))
    one_hot = np.zeros((y.shape[0], num_classes), dtype='float32')
    for i in xrange(y.shape[0]):
        one_hot[i, y[i]] = 1.
    return DenseDesignMatrix(X=x, y=one_hot)
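A short usage sketch for the helper above; it assumes create_dense_design_matrix is in scope and relies only on behavior visible in its body.

import numpy as np

x = np.random.randn(6, 4).astype('float32')
labels = np.array([0, 2, 1, 0, 2, 1])

# With num_classes given, each integer label becomes a one-hot row.
ds = create_dense_design_matrix(x, y=labels, num_classes=3)
assert ds.y.shape == (6, 3)
assert (ds.y.sum(axis=1) == 1.).all()  # exactly one 1. per row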
Example #25
    def test(store_inverse):
        rng = np.random.RandomState([1, 2, 3])
        X = as_floatX(rng.randn(15, 10))
        preprocessed_X = copy.copy(X)
        preprocessor = ZCA(store_inverse=store_inverse)

        dataset = DenseDesignMatrix(X=preprocessed_X,
                                    preprocessor=preprocessor,
                                    fit_preprocessor=True)

        preprocessed_X = dataset.get_design_matrix()

        assert_allclose(X, preprocessor.inverse(preprocessed_X))
Example #27
     def convert_to_dataset(X, y):
         X = np.vstack(X)
         y = np.vstack(y)

         # convert labels
         y = self.label_converter.get_labels(y, self.label_mode)
         y = np.hstack(y)

         one_hot_y = one_hot(y)

         dataset = DenseDesignMatrix(X=X, y=one_hot_y)
         dataset.labels = y  # for confusion matrix

         return dataset
Example #28
        def make_dataset(num_batches):
            disturb_mem.disturb_mem()
            m = num_batches * batch_size
            X = rng.randn(m, num_features)
            y = np.zeros((m, 1))
            y[:, 0] = np.dot(X, w) > 0.

            rval = DenseDesignMatrix(X=X, y=y)

            rval.yaml_src = ""  # suppress no yaml_src warning

            X = rval.get_batch_design(batch_size)
            assert X.shape == (batch_size, num_features)

            return rval
Example #29
            def convert_to_dataset(X, y):
                X = np.vstack(X)
                y = np.vstack(y)

                # convert labels
                y = self.label_converter.get_labels(y, self.label_mode)
                y = np.hstack(y)

                one_hot_y = one_hot(y)

                dataset = DenseDesignMatrix(X=X, y=one_hot_y)
                dataset.labels = y
                # for confusion matrix

                return dataset
Example #30
        def make_dataset(num_batches):
            disturb_mem.disturb_mem()
            m = num_batches * batch_size
            X = rng.randn(m, num_features)
            y = np.zeros((m, 1))
            y[:, 0] = np.dot(X, w) > 0.

            rval = DenseDesignMatrix(X=X, y=y)

            rval.yaml_src = ""  # suppress no yaml_src warning

            X = rval.get_batch_design(batch_size)
            assert X.shape == (batch_size, num_features)

            return rval
Example #31
def testing_multiple_datasets_in_monitor_based_lr():
    # tests that the class MonitorBasedLRAdjuster in sgd.py rejects
    # multiple monitoring datasets when more than one channel ending in
    # '_objective' exists. This case happens when the user has specified
    # neither channel_name nor dataset_name in the constructor.

    dim = 3
    m = 10

    rng = np.random.RandomState([6, 2, 2014])

    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at some point
    epoch_num = 1

    # including monitoring datasets lets us test that
    # the monitor works with multiple datasets
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=5,
                    monitoring_batches=2,
                    monitoring_dataset={'train': monitoring_train,
                                        'test': monitoring_test},
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    monitor_lr = MonitorBasedLRAdjuster()

    train = Train(dataset, model, algorithm, save_path=None,
                  save_freq=0, extensions=[monitor_lr])

    try:
        train.main_loop()
    except ValueError:
        return
        
    raise AssertionError("MonitorBasedLRAdjuster takes multiple dataset names in which more than one \"objective\" channel exist and the user has not specified " + 
        "either channel_name or database_name in the constructor to disambiguate.")
Example #32
    def __init__(self,
                 raw,
                 transformer,
                 cpu_only=False,
                 space_preserving=False,
                 block_length=1):
        """
            .. todo::

                WRITEME properly

            Parameters
            ----------
            raw : pylearn2 Dataset
                Provides raw data
            transformer: pylearn2 Block
                To transform the data
            block_length : int
                Number of elements in each timeseries block
        """
        assert block_length >= 1

        if block_length != 1:
            timeseries = Timeseries(X=raw, block_length=block_length)
            super(TimeseriesTransformerDataset,
                  self).__init__(timeseries, transformer, cpu_only,
                                 space_preserving)
        else:
            raw = DenseDesignMatrix(X=raw)
            super(TimeseriesTransformerDataset,
                  self).__init__(raw, transformer, cpu_only, space_preserving)
Example #33
def load_data(start, stop):
    # Loads the 1 million images into X and creates a DenseDesignMatrix
    # for use in a Denoising Autoencoder which is later used in a sDAE.
    # Returns: dataset: DenseDesignMatrix(start, stop)
    #dataset_location = "~/catkin_ws/src/athomesoftware/datasets/tinyimages/"
    #dataset_location = "~/catkin_ws/src/athomesoftware/datasets/cifar10/"
    X = []
    y = []

    print("Loading images from " + dataset_location)
    for dirpath, dirnames, filenames in os.walk(dataset_location):
        for image_name in filenames:
            if image_name.endswith('.png'):
                im = Image.open(os.path.join(dirpath, image_name))
                row = list(im.getdata())
                X.append(row)

    print("Images loaded from " + dataset_location)
    X = np.asarray(X)
    y = np.asarray(y)
    y = y.reshape(y.shape[0], 1)
    X = X[start:stop, :]
    y = y[start:stop, :]

    print("Creating design matrix " + dataset_location)
    return DenseDesignMatrix(X=X, y=y)
Example #34
def test_execution_order():

    # ensure save is called directly after monitoring by checking
    # parameter values in `on_monitor` and `on_save`.

    model = MLP(layers=[Softmax(layer_name='y', n_classes=2, irange=0.)],
                nvis=3)

    dataset = DenseDesignMatrix(X=np.random.normal(size=(6, 3)),
                                y=np.random.normal(size=(6, 2)))

    epoch_counter = EpochCounter(max_epochs=1)

    algorithm = SGD(batch_size=2,
                    learning_rate=0.1,
                    termination_criterion=epoch_counter)

    extension = ParamMonitor()

    train = Train(dataset=dataset,
                  model=model,
                  algorithm=algorithm,
                  extensions=[extension],
                  save_freq=1,
                  save_path="save.pkl")

    # mock save
    train.save = MethodType(only_run_extensions, train)

    train.main_loop()
Example #35
 def setup(self):
     """
     We use a small predefined 8x5 matrix for
     which we know the ZCA transform.
     """
     self.X = np.array([[-10.0, 3.0, 19.0, 9.0, -15.0],
                        [7.0, 26.0, 26.0, 26.0, -3.0],
                        [17.0, -17.0, -37.0, -36.0, -11.0],
                        [19.0, 15.0, -2.0, 5.0, 9.0],
                        [-3.0, -8.0, -35.0, -25.0, -8.0],
                        [-18.0, 3.0, 4.0, 15.0, 14.0],
                        [5.0, -4.0, -5.0, -7.0, -11.0],
                        [23.0, 22.0, 15.0, 20.0, 12.0]])
     self.dataset = DenseDesignMatrix(X=as_floatX(self.X),
                                      y=as_floatX(np.ones((8, 1))))
     self.num_components = self.dataset.get_design_matrix().shape[1] - 1
Example #36
def load(start, stop, datadir='data/CK'):
    im_list = glob.glob(os.path.join(datadir, 'faces_aligned/*.png'))[start:]
    if not im_list:
        msg = ('No image files found in: %s' %
               os.path.realpath(os.path.join(datadir, 'faces_aligned')))
        log.error(msg)
        raise RuntimeError(msg)
    X = []
    y = []
    more_to_read = stop - start
    for im_file in im_list:
        if more_to_read <= 0:
            break
        label_base_pat = os.path.basename(im_file)[:9] + '*_emotion.txt'
        maybe_label_file = glob.glob(
            os.path.join(datadir, 'labels', label_base_pat))
        if maybe_label_file:
            y.append(read_label(maybe_label_file[0]))
            imdata = imread(im_file, False)
            imdata = cv2.resize(imdata, (32, 32))
            imdata = imdata.flatten().astype(np.float32) / 255
            X.append(imdata)
            more_to_read -= 1
    return DenseDesignMatrix(X=np.asarray(X),
                             y=np.asarray(y).reshape(-1, 1),
                             view_converter=DefaultViewConverter(
                                 (32, 32, 1), axes=('b', 0, 1, 'c')))
Example #37
def random_dense_design_matrix(rng, num_examples, dim, num_classes):
    """
    Creates a random dense design matrix that has class labels.

    Parameters
    ----------
    rng : numpy.random.RandomState
        The random number generator used to generate the dataset.
    num_examples : int
        The number of examples to create.
    dim : int
        The number of features in each example.
    num_classes : int
        The number of classes to assign the examples to.
        0 indicates that no class labels will be generated.
    """
    X = rng.randn(num_examples, dim)

    if num_classes:
        Y = rng.randint(0, num_classes, (num_examples, 1))
        y_labels = num_classes
    else:
        Y = None
        y_labels = None

    return DenseDesignMatrix(X=X, y=Y, y_labels=y_labels)
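A usage sketch under the same assumptions as the docstring (pylearn2's DenseDesignMatrix in scope):

import numpy as np

rng = np.random.RandomState([2015, 1, 1])
ds = random_dense_design_matrix(rng, num_examples=20, dim=5, num_classes=4)
assert ds.X.shape == (20, 5)
assert ds.y.shape == (20, 1)
assert 0 <= ds.y.min() and ds.y.max() < 4  # labels drawn from [0, 4)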
Example #38
def test_serialization_guard():

    # tests that Train refuses to serialize the dataset

    dim = 2
    m = 11

    rng = np.random.RandomState([28, 9, 2012])
    X = rng.randn(m, dim)
    dataset = DenseDesignMatrix(X=X)

    model = DummyModel(dim)
    # make the dataset part of the model, so it will get
    # serialized
    model.dataset = dataset

    Monitor.get_monitor(model)

    algorithm = DummyAlgorithm()

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path='_tmp_unit_test.pkl',
                  save_freq=1,
                  callbacks=None)

    try:
        train.main_loop()
    except RuntimeError:
        return
    assert False  # train did not complain, this is a bug
Example #39
def monary_load(start=0, stop=-1, find_args={}, species_to_retrieve=[]):
    if species_to_retrieve == []:
        species_to_retrieve = species
    else:
        species_to_retrieve = [s for s in species_to_retrieve if s in species]
    query = {}
    for s in species_to_retrieve:
        query[s] = {"$gt": 0}
    find_args["$or"] = [{k: query[k]} for k in query.keys()]
    with Monary("127.0.0.1") as monary:
        out = monary.query(
            "creeval",
            collection,
            find_args,
            num_metadata + cat_metadata + species_to_retrieve, ["float32"] *
            (len(num_metadata) + len(cat_metadata) + len(species_to_retrieve)),
            limit=(stop - start),
            offset=start)
    for i, col in enumerate(out[0:len(num_metadata + cat_metadata)]):
        out[i] = np.ma.filled(col, np.ma.mean(col))
        #if any(np.isnan(col)):
        #	print col
    out = np.ma.row_stack(out).T
    X = out[:, 0:len(num_metadata + cat_metadata)]
    y = out[:, len(num_metadata + cat_metadata):]
    y = (y > 0).astype(int)

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    pickle.dump(scaler, open(collection + "_scaler.pkl", "wb"))
    y = np.asarray(y)

    return DenseDesignMatrix(X=X, y=y)
Example #40
def array_to_ds(X):
    """
    Build a DenseDesignMatrix with topo_view using X.
    X: a nsamples x pixels numpy array, or a list of linearized images
    """
    if type(X) is list:
        X = np.asarray(X)
    return DenseDesignMatrix(topo_view=X.reshape(X.shape + (1,)))
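A sketch of the intended call. Note that the helper only appends a trailing channel axis, so passing a batch of 2-D images shaped (n, rows, cols), rather than flattened rows, yields a valid ('b', 0, 1, 'c') topological view:

import numpy as np

imgs = np.random.rand(4, 8, 8).astype('float32')  # 4 toy 8x8 images
ds = array_to_ds(imgs)  # topo_view shape becomes (4, 8, 8, 1)
assert ds.get_topological_view().shape == (4, 8, 8, 1)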
Example #41
    def test_zero_image(self):
        """
        Test that a zero-valued image does not cause any division by zero
        """

        X = as_floatX(np.zeros((5, 32 * 32 * 3)))

        axes = ['b', 0, 1, 'c']
        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)
        dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
        dataset.axes = axes
        preprocessor = LeCunLCN(img_shape=[32, 32])
        dataset.apply_preprocessor(preprocessor)
        result = dataset.get_design_matrix()

        assert isfinite(result)
Example #42
    def test_zero_vector(self):
        """ Test that passing in the zero vector does not result in
            a divide by 0 """

        dataset = DenseDesignMatrix(X=as_floatX(np.zeros((1, 1))))

        # the settings of subtract_mean and use_norm are not relevant to
        # the test
        # sqrt_bias = 0.0 is the only value for which there should be a risk
        # of failure occurring
        preprocessor = GlobalContrastNormalization(subtract_mean=True,
                                                   sqrt_bias=0.0,
                                                   use_std=True)

        dataset.apply_preprocessor(preprocessor)

        result = dataset.get_design_matrix()

        assert not np.any(np.isnan(result))
        assert not np.any(np.isinf(result))
Example #43
    def test_channel(self):
        """
        Test that it works fine with different numbers of channels as argument
        """

        rng = np.random.RandomState([1, 2, 3])
        X = as_floatX(rng.randn(5, 32 * 32 * 3))

        axes = ['b', 0, 1, 'c']
        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)
        dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
        dataset.axes = axes
        preprocessor = LeCunLCN(img_shape=[32, 32], channels=[1, 2])
        dataset.apply_preprocessor(preprocessor)
        result = dataset.get_design_matrix()

        assert isfinite(result)
Example #44
def test_finitedataset_source_check():
    """
    Check that the FiniteDatasetIterator returns sensible
    errors when there is a missing source in the dataset.
    """
    dataset = DenseDesignMatrix(
        X=np.random.rand(20, 15).astype(theano.config.floatX),
        y=np.random.rand(20, 5).astype(theano.config.floatX))
    assert_raises(ValueError,
                  dataset.iterator,
                  mode='sequential',
                  batch_size=5,
                  data_specs=(VectorSpace(15), 'featuresX'))
    try:
        dataset.iterator(mode='sequential',
                         batch_size=5,
                         data_specs=(VectorSpace(15), 'featuresX'))
    except ValueError as e:
        assert 'featuresX' in str(e)
Example #45
def test_rgb_yuv():
    """
    Test on a random image that the preprocessor loads and works without
    any error and doesn't result in any NaN or Inf values

    """

    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(5, 32 * 32 * 3))

    axes = ['b', 0, 1, 'c']
    view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                              axes)
    dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
    dataset.axes = axes
    preprocessor = RGB_YUV()
    dataset.apply_preprocessor(preprocessor)
    result = dataset.get_design_matrix()

    assert isfinite(result)
Example #46
    def test_random_image(self):
        """
        Test on a random image that the preprocessor loads and works without
        any error and doesn't result in any NaN or Inf values

        """

        rng = np.random.RandomState([1, 2, 3])
        X = as_floatX(rng.randn(5, 32 * 32 * 3))

        axes = ["b", 0, 1, "c"]
        view_converter = dense_design_matrix.DefaultViewConverter(
            (32, 32, 3), axes)
        dataset = DenseDesignMatrix(X=X, view_converter=view_converter)
        dataset.axes = axes
        preprocessor = LeCunLCN(img_shape=[32, 32])
        dataset.apply_preprocessor(preprocessor)
        result = dataset.get_design_matrix()

        assert not np.any(np.isnan(result))
        assert not np.any(np.isinf(result))
Example #47
def test_split_nfold_datasets():
    #Load and create ddm from cifar100
    path = "/data/lisa/data/cifar100/cifar-100-python/train"
    obj = serial.load(path)
    X = obj['data']

    assert X.max() == 255.
    assert X.min() == 0.

    X = np.cast['float32'](X)
    y = None  # not implemented yet

    view_converter = DefaultViewConverter((32, 32, 3))

    ddm = DenseDesignMatrix(X=X, y=y, view_converter=view_converter)

    assert not np.any(np.isnan(ddm.X))
    ddm.y_fine = np.asarray(obj['fine_labels'])
    ddm.y_coarse = np.asarray(obj['coarse_labels'])
    folds = ddm.split_dataset_nfolds(10)
    print folds[0].shape
Example #48
def test_extract_reassemble():
    """ Tests that ExtractGridPatches and ReassembleGridPatches are
    inverse of each other """

    rng = np.random.RandomState([1, 3, 7])

    topo = rng.randn(4, 3 * 5, 3 * 7, 2)

    dataset = DenseDesignMatrix(topo_view=topo)

    patch_shape = (3, 7)

    extractor = ExtractGridPatches(patch_shape, patch_shape)
    reassemblor = ReassembleGridPatches(patch_shape=patch_shape,
                                        orig_shape=topo.shape[1:3])

    dataset.apply_preprocessor(extractor)
    dataset.apply_preprocessor(reassemblor)

    new_topo = dataset.get_topological_view()

    assert new_topo.shape == topo.shape

    assert np.all(new_topo == topo)
Example #49
def test_split_datasets():
    #Load and create ddm from cifar100
    path = "/data/lisa/data/cifar100/cifar-100-python/train"
    obj = serial.load(path)
    X = obj['data']

    assert X.max() == 255.
    assert X.min() == 0.

    X = np.cast['float32'](X)
    y = None  # not implemented yet

    view_converter = DefaultViewConverter((32, 32, 3))

    ddm = DenseDesignMatrix(X=X, y=y, view_converter=view_converter)

    assert not np.any(np.isnan(ddm.X))
    ddm.y_fine = np.asarray(obj['fine_labels'])
    ddm.y_coarse = np.asarray(obj['coarse_labels'])
    (train, valid) = ddm.split_dataset_holdout(train_prop=0.5)
    assert valid.shape[0] == np.ceil(ddm.num_examples * 0.5)
    assert train.shape[0] == (ddm.num_examples - valid.shape[0])
Example #50
 def __init__(self, filename, X=None, topo_view=None, y=None,
              load_all=False, **kwargs):
     if 'preprocessor' in kwargs:
         if ('fit_preprocessor' in kwargs and 
             kwargs['fit_preprocessor'] is False) or ('fit_preprocessor' 
                                                      not in kwargs):
             self._preprocessor = kwargs['preprocessor']
             kwargs['preprocessor'] = None
     else:
         self._preprocessor = None
     self.load_all = load_all
     if h5py is None:
         raise RuntimeError("Could not import h5py.")
     self._file = h5py.File(filename)
     if X is not None:
         X = self.get_dataset(X, load_all)
     if topo_view is not None:
         topo_view = self.get_dataset(topo_view, load_all)
     if y is not None:
         y = self.get_dataset(y, load_all)
     DenseDesignMatrix.__init__(self, X=X, topo_view=topo_view, y=y,
                                **kwargs)
Example #51
 def setup(self):
     """
     We use a small predefined 8x5 matrix for
     which we know the ZCA transform.
     """
     self.X = np.array([[-10.0, 3.0, 19.0, 9.0, -15.0],
                       [7.0, 26.0, 26.0, 26.0, -3.0],
                       [17.0, -17.0, -37.0, -36.0, -11.0],
                       [19.0, 15.0, -2.0, 5.0, 9.0],
                       [-3.0, -8.0, -35.0, -25.0, -8.0],
                       [-18.0, 3.0, 4.0, 15.0, 14.0],
                       [5.0, -4.0, -5.0, -7.0, -11.0],
                       [23.0, 22.0, 15.0, 20.0, 12.0]])
     self.dataset = DenseDesignMatrix(X=as_floatX(self.X),
                                      y=as_floatX(np.ones((8, 1))))
     self.num_components = self.dataset.get_design_matrix().shape[1] - 1
Example #52
def test_init_with_X_or_topo():
    # tests that constructing with topo_view works
    # tests that construction with design matrix works
    # tests that conversion from topo_view to design matrix and back works
    # tests that conversion the other way works too
    rng = np.random.RandomState([1, 2, 3])
    topo_view = rng.randn(5, 2, 2, 3)
    d1 = DenseDesignMatrix(topo_view=topo_view)
    X = d1.get_design_matrix()
    d2 = DenseDesignMatrix(X=X, view_converter=d1.view_converter)
    topo_view_2 = d2.get_topological_view()
    assert np.allclose(topo_view, topo_view_2)
    X = rng.randn(*X.shape)
    topo_view_3 = d2.get_topological_view(X)
    X2 = d2.get_design_matrix(topo_view_3)
    assert np.allclose(X, X2)
Example #53
    def __init__(self,
                 patient_id,
                 which_set,
                 list_features,
                 leave_out_seizure_idx_valid,
                 leave_out_seizure_idx_test,
                 data_dir,
                 preictal_sec,
                 use_all_nonictals,
                 preprocessor_dir,
                 n_selected_features=-1,
                 batch_size=None,
                 balance_class=True,
                 axes=('b', 0, 1, 'c'),
                 default_seed=0):

        self.balance_class = balance_class
        self.batch_size = batch_size

        tmp_list_features = np.empty(len(list_features), dtype=object)
        for f_idx in range(len(list_features)):
            tmp_list_features[f_idx] = FeatureList.get_info(list_features[f_idx])
        list_features = tmp_list_features

        print 'List of features:'
        for f in list_features:
            print f['feature'] + '.' + f['param']
        print ''

        EpilepsiaeFeatureLoader.__init__(self,
                                         patient_id=patient_id,
                                         which_set=which_set,
                                         list_features=list_features,
                                         leave_out_seizure_idx_valid=leave_out_seizure_idx_valid,
                                         leave_out_seizure_idx_test=leave_out_seizure_idx_test,
                                         data_dir=data_dir,
                                         preictal_sec=preictal_sec,
                                         use_all_nonictals=use_all_nonictals)
        # Row: samples, Col: features
        raw_X, y = self.load_data()

        if n_selected_features != -1:
            all_rank_df = None
            for f_idx, feature in enumerate(self.list_features):
                rank_df = pd.read_csv(os.path.join(data_dir, patient_id +
                                                 '/rank_feature_idx_' + feature['param'] + '_' +
                                                 'leaveout_' + str(leave_out_seizure_idx_valid) + '_' +
                                                 str(leave_out_seizure_idx_test) + '.txt'))
                if f_idx == 0:
                    all_rank_df = rank_df
                else:
                    offset_f_idx = 0
                    for i in range(f_idx):
                        offset_f_idx = offset_f_idx + self.list_features[i]['n_features']
                    rank_df['feature_idx'] = rank_df['feature_idx'].values + offset_f_idx
                    all_rank_df = pd.concat([all_rank_df, rank_df])

            sorted_feature_df = all_rank_df.sort(['D_ADH'], ascending=[0])
            self.selected_feature_idx = sorted_feature_df['feature_idx'][:n_selected_features]
            raw_X = raw_X[:, self.selected_feature_idx]
        else:
            self.selected_feature_idx = np.arange(raw_X.shape[1])

        # Print shape of input data
        print '------------------------------'
        print 'Dataset: {0}'.format(self.which_set)
        print 'Number of samples: {0}'.format(raw_X.shape[0])
        print ' Preictal samples: {0}'.format(self.preictal_samples)
        print ' Nonictal samples: {0}'.format(self.nonictal_samples)
        print ' NaN samples: {0}'.format(self.nan_non_flat_samples)
        print ' Note for \'train\' and \'valid_train\': number of samples will be equal without removing the nan samples.'
        print 'Number of features: {0}'.format(raw_X.shape[1])
        print '------------------------------'

        # Preprocessing
        if which_set == 'train':
            scaler = preprocessing.StandardScaler()
            # scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler = scaler.fit(raw_X)

            with open(os.path.join(preprocessor_dir, self.patient_id + '_scaler_feature_' +
                                                     str(self.leave_out_seizure_idx_valid) + '_' +
                                                     str(self.leave_out_seizure_idx_test) + '.pkl'), 'wb') as f:
                pickle.dump(scaler, f)

            preprocessed_X = scaler.transform(raw_X)
        else:
            with open(os.path.join(preprocessor_dir, self.patient_id + '_scaler_feature_' +
                                                     str(self.leave_out_seizure_idx_valid) + '_' +
                                                     str(self.leave_out_seizure_idx_test) + '.pkl'), 'rb') as f:
                scaler = pickle.load(f)

            preprocessed_X = scaler.transform(raw_X)

        raw_X = None

        if self.which_set == 'train' or self.which_set == 'valid_train':
            # Shuffle the data
            print ''
            print '*** Shuffle data ***'
            print ''
            permute_idx = np.random.permutation(preprocessed_X.shape[0])
            preprocessed_X = preprocessed_X[permute_idx, :]
            y = y[permute_idx, :]

        if self.balance_class and (self.which_set == 'train' or self.which_set == 'valid_train'):
            self.X_full = preprocessed_X
            self.y_full = y

            (X, y) = self.get_data()
        else:
            # Zero-padding (if necessary)
            if not (self.batch_size is None):
                preprocessed_X, y = self.zero_pad(preprocessed_X, y, self.batch_size)

            X = preprocessed_X

        # Initialize DenseDesignMatrix
        DenseDesignMatrix.__init__(self,
                                   X=X,
                                   y=y,
                                   axes=axes)
Example #54
k = 3

X = np.zeros((m * k, patch_shape[0] * patch_shape[1] * 3), dtype='float32')

rng = np.random.RandomState([1, 2, 3])

for i, img_path in enumerate(ImageIterator(path, suffix=".npy")):

    img = np.load(img_path)

    if img.shape[2] == 1:
        img = np.concatenate((img, img, img), axis=2)

    img = img.reshape(1, img.shape[0], img.shape[1], img.shape[2])

    d = DenseDesignMatrix(topo_view=img,
                          view_converter=DefaultViewConverter(img.shape[1:]))

    random_rng = np.random.RandomState([rng.randint(0, 256),
                                        rng.randint(0, 256),
                                        rng.randint(0, 256)])

    p = ExtractPatches(patch_shape=patch_shape, num_patches=k,
                       rng=random_rng)

    d.apply_preprocessor(p)

    # k patches per image (k == 3 here), not a hard-coded 3
    X[i * k:(i + 1) * k, :] = d.X

d.X = X

base = '/data/lisatmp/goodfeli/darpa_imagenet_patch_%dx%d_train.' % (patch_shape[0], patch_shape[1])

d.use_design_loc(base+'npy')
serial.save(base+'pkl',d)
Example #55
    if feature_type == 'exp_hs':
        feat = H * Mu1
    elif feature_type == 'exp_h':
        feat = H
    elif feature_type == 'map_hs':
        feat = (H > 0.5) * Mu1
    else:
        assert False

    print 'compiling theano function'
    f = function([V], feat)

    print 'running theano function'
    feat = f(X2)

    feat_dataset = DenseDesignMatrix(
        X=feat,
        view_converter=DefaultViewConverter([1, 1, feat.shape[1]]))

    print 'reassembling features'
    ns = 32 - size + 1
    depatchifier = ReassembleGridPatches(orig_shape=(ns, ns),
                                         patch_shape=(1, 1))
    feat_dataset.apply_preprocessor(depatchifier)

    print 'making topological view'
    topo_feat = feat_dataset.get_topological_view()
    assert topo_feat.shape[0] == X.shape[0]

    print 'assembling visualizer'

    n = np.ceil(np.sqrt(model.nhid))

    pv3 = PatchViewer(grid_shape=(X.shape[0], num_filters),
                      patch_shape=(ns, ns), is_color=False)
Example #56
 def setup(self):
     rng = np.random.RandomState([1, 2, 3])
     self.dataset = DenseDesignMatrix(X=as_floatX(rng.randn(15, 10)),
                                      y=as_floatX(rng.randn(15, 1)))
     self.num_components = self.dataset.get_design_matrix().shape[1] - 1
Example #57
class testZCA:

    def setup(self):
        """
        We use a small predefined 8x5 matrix for
        which we know the ZCA transform.
        """
        self.X = np.array([[-10.0, 3.0, 19.0, 9.0, -15.0],
                          [7.0, 26.0, 26.0, 26.0, -3.0],
                          [17.0, -17.0, -37.0, -36.0, -11.0],
                          [19.0, 15.0, -2.0, 5.0, 9.0],
                          [-3.0, -8.0, -35.0, -25.0, -8.0],
                          [-18.0, 3.0, 4.0, 15.0, 14.0],
                          [5.0, -4.0, -5.0, -7.0, -11.0],
                          [23.0, 22.0, 15.0, 20.0, 12.0]])
        self.dataset = DenseDesignMatrix(X=as_floatX(self.X),
                                         y=as_floatX(np.ones((8, 1))))
        self.num_components = self.dataset.get_design_matrix().shape[1] - 1

    def get_preprocessed_data(self, preprocessor):
        X = copy.copy(self.X)
        dataset = DenseDesignMatrix(X=X,
                                    preprocessor=preprocessor,
                                    fit_preprocessor=True)
        return dataset.get_design_matrix()

    def test_zca(self):
        """
        Confirm that ZCA.inv_P_ is the correct inverse of ZCA.P_.
        There's a lot else about the ZCA class that could be tested here.
        """
        preprocessor = ZCA()
        preprocessor.fit(self.X)

        identity = np.identity(self.X.shape[1], theano.config.floatX)
        # Check some basics of transformation matrix
        assert preprocessor.P_.shape == (self.X.shape[1], self.X.shape[1])
        assert_allclose(np.dot(preprocessor.P_,
                               preprocessor.inv_P_), identity, rtol=1e-4)

        preprocessor = ZCA(filter_bias=0.0)
        preprocessed_X = self.get_preprocessed_data(preprocessor)

        # Check if preprocessed data matrix is white
        assert_allclose(np.cov(preprocessed_X.transpose(),
                               bias=1), identity, rtol=1e-4)

        # Check if we obtain correct solution
        zca_transformed_X = np.array(
            [[-1.0199, -0.1832, 1.9528, -0.9603, -0.8162],
             [0.0729, 1.4142, 0.2529, 1.1861, -1.0876],
             [0.9575, -1.1173, -0.5435, -1.4372, -0.1057],
             [0.6348, 1.1258, 0.2692, -0.8893, 1.1669],
             [-0.9769, 0.8297, -1.8676, -0.6055, -0.5096],
             [-1.5700, -0.8389, -0.0931, 0.8877, 1.6089],
             [0.4993, -1.4219, -0.3443, 0.9664, -1.1022],
             [1.4022, 0.1917, 0.3736, 0.8520, 0.8456]]
        )
        assert_allclose(preprocessed_X, zca_transformed_X, rtol=1e-3)

    def test_num_components(self):
        # Keep 3 components
        preprocessor = ZCA(filter_bias=0.0, n_components=3)
        preprocessed_X = self.get_preprocessed_data(preprocessor)

        zca_truncated_X = np.array(
            [[-0.8938, -0.3084, 1.1105, 0.1587, -1.4073],
             [0.3346, 0.5193, 1.1371, 0.6545, -0.4199],
             [0.7613, -0.4823, -1.0578, -1.1997, -0.4993],
             [0.9250, 0.5012, -0.2743, 0.1735, 0.8105],
             [-0.4928, -0.6319, -1.0359, -0.7173, 0.1469],
             [-1.8060, -0.1758, -0.2943, 0.7208, 1.4359],
             [0.0079, -0.2582, 0.1368, -0.3571, -0.8147],
             [1.1636, 0.8362, 0.2777, 0.5666, 0.7480]]
        )
        assert_allclose(zca_truncated_X, preprocessed_X, rtol=1e-3)

        # Drop 2 components: result should be similar
        preprocessor = ZCA(filter_bias=0.0, n_drop_components=2)
        preprocessed_X = self.get_preprocessed_data(preprocessor)
        assert_allclose(zca_truncated_X, preprocessed_X, rtol=1e-3)

    def test_zca_inverse(self):
        """
        Calculates the inverse of X with numpy.linalg.inv
        if inv_P_ is not stored.
        """
        def test(store_inverse):
            preprocessed_X = copy.copy(self.X)
            preprocessor = ZCA(store_inverse=store_inverse)

            dataset = DenseDesignMatrix(X=preprocessed_X,
                                        preprocessor=preprocessor,
                                        fit_preprocessor=True)

            preprocessed_X = dataset.get_design_matrix()
            assert_allclose(self.X, preprocessor.inverse(preprocessed_X))

        test(store_inverse=True)
        test(store_inverse=False)

    def test_zca_dtypes(self):
        """
        Confirm that ZCA.fit works regardless of dtype of
        data and config.floatX
        """

        orig_floatX = config.floatX

        try:
            for floatX in ['float32', 'float64']:
                config.floatX = floatX
                for dtype in ['float32', 'float64']:
                    preprocessor = ZCA()
                    preprocessor.fit(self.X.astype(dtype))
        finally:
            config.floatX = orig_floatX
Example #58
 def get_preprocessed_data(self, preprocessor):
     X = copy.copy(self.X)
     dataset = DenseDesignMatrix(X=X,
                                 preprocessor=preprocessor,
                                 fit_preprocessor=True)
     return dataset.get_design_matrix()