Example #1
    def test_correct_dimensions_order(self, clf):
        """To check if known/present Classifiers are working properly
        with samples being first dimension. Started to worry about
        possible problems while looking at sg where samples are 2nd
        dimension
        """
        # specially crafted dataset -- if dimensions are flipped over
        # the same storage, problem becomes unseparable. Like in this case
        # incorrect order of dimensions lead to equal samples [0, 1, 0]
        traindatas = [
            dataset_wizard(samples=np.array([ [0, 0, 1.0],
                                        [1, 0, 0] ]), targets=[0, 1]),
            dataset_wizard(samples=np.array([ [0, 0.0],
                                      [1, 1] ]), targets=[0, 1])]

        clf.ca.change_temporarily(enable_ca = ['training_stats'])
        for traindata in traindatas:
            clf.train(traindata)
            self.failUnlessEqual(clf.ca.training_stats.percent_correct, 100.0,
                "Classifier %s must have 100%% correct learning on %s. Has %f" %
                (`clf`, traindata.samples, clf.ca.training_stats.percent_correct))

            # and we must be able to predict every original sample thus
            for i in xrange(traindata.nsamples):
                sample = traindata.samples[i,:]
                predicted = clf.predict([sample])
                self.failUnlessEqual(predicted[0], traindata.targets[i],
                    "We must be able to predict sample %s using " % sample +
                    "classifier %s" % `clf`)
        clf.ca.reset_changed_temporarily()
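
A minimal sketch (not part of the test above) of the samples-first convention the test enforces; it simply rebuilds the first crafted dataset with dataset_wizard and restates the shape:

import numpy as np
from mvpa.datasets.base import dataset_wizard

# samples are rows (first dimension), features are columns
ds = dataset_wizard(samples=np.array([[0, 0, 1.0],
                                      [1, 0, 0]]),
                    targets=[0, 1])
assert ds.nsamples == 2    # two rows -> two samples
assert ds.nfeatures == 3   # three columns -> three features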
Example #2
    def test_correct_dimensions_order(self, clf):
        """To check if known/present Classifiers are working properly
        with samples being first dimension. Started to worry about
        possible problems while looking at sg where samples are 2nd
        dimension
        """
        # specially crafted dataset -- if dimensions are flipped over
        # the same storage, problem becomes unseparable. Like in this case
        # incorrect order of dimensions lead to equal samples [0, 1, 0]
        traindatas = [
            dataset_wizard(samples=np.array([[0, 0, 1.0], [1, 0, 0]]),
                           targets=[-1, 1]),
            dataset_wizard(samples=np.array([[0, 0.0], [1, 1]]),
                           targets=[-1, 1])
        ]

        clf.ca.change_temporarily(enable_ca=['training_confusion'])
        for traindata in traindatas:
            clf.train(traindata)
            self.failUnlessEqual(
                clf.ca.training_confusion.percent_correct, 100.0,
                "Classifier %s must have 100%% correct learning on %s. Has %f"
                % (`clf`, traindata.samples,
                    clf.ca.training_confusion.percent_correct))

            # and we must be able to predict every original sample thus
            for i in xrange(traindata.nsamples):
                sample = traindata.samples[i, :]
                predicted = clf.predict([sample])
                self.failUnlessEqual(
                    predicted[0], traindata.targets[i],
                    "We must be able to predict sample %s using " % sample +
                    "classifier %s" % `clf`)
        clf.ca.reset_changed_temporarily()
Example #3
    def test_feature_selection_classifier(self):
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(
            sens_ana_rev, FixedNElementTailSelector(1))

        samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1], [-1, 0, 1],
                            [1, -1, 1]])

        testdata3 = dataset_wizard(samples=samples, targets=1)
        # dummy train data so proper mapper gets created
        traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                                   targets=[1, 2])

        # targets
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign,
                                            feat_sel,
                                            enable_ca=['feature_ids'])

        self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
        clf011.train(traindata)

        self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.failUnless(len(clf011.ca.estimates) == len(res110),
                        msg="We need to pass values into ProxyClassifier")
        self.clf_sign.ca.reset_changed_temporarily()

        self.failUnlessEqual(len(clf011.ca.feature_ids), 2,
            "Feature selection classifier had to be trained on 2 features")

        # second classifier -- last feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf011.train(traindata)
        self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
Example #4
    def test_coarsen_chunks(self):
        """Just basic testing for now"""
        chunks = [1,1,2,2,3,3,4,4]
        ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=chunks)
        coarsen_chunks(ds, nchunks=2)
        chunks1 = coarsen_chunks(chunks, nchunks=2)
        self.failUnless((chunks1 == ds.chunks).all())
        self.failUnless((chunks1 == np.asarray([0,0,0,0,1,1,1,1])).all())

        ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=range(len(chunks)))
        coarsen_chunks(ds2, nchunks=2)
        self.failUnless((chunks1 == ds2.chunks).all())
Example #5
    def test_feature_selection_classifier(self):
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
            FixedNElementTailSelector(1))

        samples = np.array([ [0, 0, -1], [1, 0, 1], [-1, -1, 1],
                            [-1, 0, 1], [1, -1, 1] ])

        testdata3 = dataset_wizard(samples=samples, targets=1)
        # dummy train data so proper mapper gets created
        traindata = dataset_wizard(samples=np.array([ [0, 0, -1], [1, 0, 1] ]),
                            targets=[1, 2])

        # targets
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                    enable_ca=['feature_ids'])

        self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
        clf011.train(traindata)

        self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.failUnless(len(clf011.ca.estimates) == len(res110),
                        msg="We need to pass values into ProxyClassifier")
        self.clf_sign.ca.reset_changed_temporarily()

        self.failUnlessEqual(clf011.mapper._oshape, (2,),
            "Feature selection classifier had to be trained on 2 features")

        # second classifier -- last feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf011.train(traindata)
        self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
Example #6
def pure_multivariate_signal(patterns, signal2noise=1.5, chunks=None,
                             targets=[0, 1]):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # start with noise
    data = np.random.normal(size=(4*patterns, 2))

    # add signal
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns) + (targets[1:2] * 2 * patterns) + (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
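
A quick usage sketch for the generator above (the numbers are illustrative); the comments only restate the XOR-like quadrant layout from the docstring:

# 50 patterns per quadrant -> 4*50 = 200 samples, 2 features
ds = pure_multivariate_signal(patterns=50, signal2noise=3.0)
assert ds.nsamples == 200 and ds.nfeatures == 2
# per-feature class means all hover around zero: the signal lives in the
# interaction of the two features (XOR layout), not in either one alone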
Example #7
def pure_multivariate_signal(patterns, signal2noise=1.5, chunks=None):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # start with noise
    data = np.random.normal(size=(4*patterns, 2))

    # add signal
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # two conditions
    regs = np.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
Example #8
    def test_feature_selection_classifier_with_regression(self):
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression classifier was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        # now test with regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                      targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        self.failIf((np.array(clf_reg.ca.estimates)
                     - clf_reg.ca.predictions).sum()==0,
                    msg="Values were set to the predictions in %s." %
                    sample_clf_reg)
Example #9
    def test_feature_selection_classifier_with_regression(self):
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression classifier was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        # now test with regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                             targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        self.failIf(
            (np.array(clf_reg.ca.estimates) -
             clf_reg.ca.predictions).sum() == 0,
            msg="Values were set to the predictions in %s." % sample_clf_reg)
Example #10
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset
    """
    data = [
        [1, 0],
        [1, 1],
        [2, 0],
        [2, 1],
        [3, 0],
        [3, 1],
        [4, 0],
        [4, 1],
        [5, 0],
        [5, 1],
        [6, 0],
        [6, 1],
        [7, 0],
        [7, 1],
        [8, 0],
        [8, 1],
        [9, 0],
        [9, 1],
        [10, 0],
        [10, 1],
        [11, 0],
        [11, 1],
        [12, 0],
        [12, 1],
    ]
    regs = ([0] * 12) + ([1] * 12)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
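
Since the table is fully regular, the same dataset can be built programmatically; a sketch of an equivalent construction (same values, same order):

import numpy as np

data = [[i, j] for i in range(1, 13) for j in (0, 1)]  # [1,0], [1,1], ..., [12,1]
regs = [0] * 12 + [1] * 12
ds = dataset_wizard(samples=np.array(data), targets=regs,
                    chunks=range(len(regs)))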
Example #11
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [
        [1, 0],
        [1, 1],
        [2, 0],
        [2, 1],
        [3, 0],
        [3, 1],
        [4, 0],
        [4, 1],
        [5, 0],
        [5, 1],
        [6, 0],
        [6, 1],
        [7, 0],
        [7, 1],
        [8, 0],
        [8, 1],
        [9, 0],
        [9, 1],
        [10, 0],
        [10, 1],
        [11, 0],
        [11, 1],
        [12, 0],
        [12, 1],
    ]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #12
def test_origid_handling():
    ds = dataset_wizard(np.atleast_2d(np.arange(35)).T)
    ds.init_origids('both')
    ok_(ds.nsamples == 35)
    assert_equal(len(np.unique(ds.sa.origids)), 35)
    assert_equal(len(np.unique(ds.fa.origids)), 1)
    selector = [3, 7, 10, 15]
    subds = ds[selector]
    assert_array_equal(subds.sa.origids, ds.sa.origids[selector])

    # Now, if we request new origids while they are already present, we
    # can expect different behavior
    assert_raises(ValueError, subds.init_origids, 'both', mode='raises')
    sa_origids = subds.sa.origids.copy()
    fa_origids = subds.fa.origids.copy()
    for s in ('both', 'samples', 'features'):
        assert_raises(RuntimeError, subds.init_origids, s, mode='raise')
        subds.init_origids(s, mode='existing')
        # we should have the same origids as before
        assert_array_equal(subds.sa.origids, sa_origids)
        assert_array_equal(subds.fa.origids, fa_origids)

    # Let's now regenerate them, which is the default behavior
    subds.init_origids('both')
    assert_equal(len(sa_origids), len(subds.sa.origids))
    assert_equal(len(fa_origids), len(subds.fa.origids))
    # values should change though
    ok_((sa_origids != subds.sa.origids).any())
    ok_((fa_origids != subds.fa.origids).any())
Example #13
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0, x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    x = np.linspace(start=x_min, stop=x_max, num=size)
    noise = np.random.randn(size) * sigma
    y = x * slope + intercept + noise
    return dataset_wizard(samples=x[:, None], targets=y)
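
As a sanity check, an ordinary least-squares fit should recover the generating parameters up to the noise; a minimal sketch using np.polyfit:

import numpy as np

ds = linear1d_gaussian_noise(size=1000, slope=0.5, intercept=1.0)
# polyfit with degree 1 returns [slope, intercept]
slope_hat, intercept_hat = np.polyfit(ds.samples[:, 0], ds.targets, 1)
assert abs(slope_hat - 0.5) < 0.1
assert abs(intercept_hat - 1.0) < 0.1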
Example #14
def test_idhash():
    ds = dataset_wizard(np.arange(12).reshape((4, 3)),
                 targets=1, chunks=1)
    origid = ds.idhash
    #XXX BUG -- no assurance that labels would become an array... for now -- do manually
    ds.targets = np.array([3, 1, 2, 3])   # change all labels
    ok_(origid != ds.idhash,
                    msg="Changing all targets should alter dataset's idhash")

    origid = ds.idhash

    z = ds.targets[1]
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    z = ds.chunks
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    z[2] = 333
    ok_(origid != ds.idhash,
        msg="Changing value in attribute should change idhash")

    origid = ds.idhash
    ds.samples[1, 1] = 1000
    ok_(origid != ds.idhash,
        msg="Changing value in data should change idhash")

    origid = ds.idhash
    orig_labels = ds.targets #.copy()
    ds.permute_targets()
    ok_(origid != ds.idhash,
        msg="Permutation also changes idhash")

    ds.targets = orig_labels
    ok_(origid == ds.idhash,
        msg="idhash should be restored after reassigning orig targets")
Example #15
    def test_aggregation(self):
        data = dataset_wizard(np.arange( 20 ).reshape((4, 5)), targets=1, chunks=1)

        ag_data = aggregate_features(data, np.mean)

        ok_(ag_data.nsamples == 4)
        ok_(ag_data.nfeatures == 1)
        assert_array_equal(ag_data.samples[:, 0], [2, 7, 12, 17])
Example #16
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0,
                            x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    x = np.linspace(start=x_min, stop=x_max, num=size)
    noise = np.random.randn(size)*sigma
    y = x * slope + intercept + noise
    return dataset_wizard(samples=x[:, None], targets=y)
Example #17
    def setUp(self):
        self.clf_sign = SameSignClassifier()
        self.clf_less1 = Less1Classifier()

        # simple binary dataset
        self.data_bin_1 = dataset_wizard(
            samples=[[0, 0], [-10, -1], [1, 0.1], [1, -1], [-1, 1]],
            targets=[1, 1, 1, -1, -1],  # labels
            chunks=[0, 1, 2, 2, 3])  # chunks
Example #18
    def setUp(self):
        self.clf_sign = SameSignClassifier()
        self.clf_less1 = Less1Classifier()

        # simple binary dataset
        self.data_bin_1 = dataset_wizard(
            samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
            targets=[1, 1, 1, -1, -1], # labels
            chunks=[0, 1, 2,  2, 3])  # chunks
Example #19
def test_str():
    args = ( np.arange(12, dtype=np.int8).reshape((4, 3)),
             range(4),
             [1, 1, 2, 2])
    for iargs in range(1, len(args)):
        ds = dataset_wizard(*(args[:iargs]))
        ds_s = str(ds)
        ok_(ds_s.startswith('<Dataset: 4x3@int8'))
        ok_(ds_s.endswith('>'))
Example #20
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([0] * 12) + ([1] * 12)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #21
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #22
    def test_aggregation(self):
        data = dataset_wizard(np.arange(20).reshape((4, 5)),
                              targets=1,
                              chunks=1)

        ag_data = aggregate_features(data, np.mean)

        ok_(ag_data.nsamples == 4)
        ok_(ag_data.nfeatures == 1)
        assert_array_equal(ag_data.samples[:, 0], [2, 7, 12, 17])
Example #23
    def test_coarsen_chunks(self):
        """Just basic testing for now"""
        chunks = [1, 1, 2, 2, 3, 3, 4, 4]
        ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks), 1)),
                            targets=[1] * 8,
                            chunks=chunks)
        coarsen_chunks(ds, nchunks=2)
        chunks1 = coarsen_chunks(chunks, nchunks=2)
        self.failUnless((chunks1 == ds.chunks).all())
        self.failUnless((chunks1 == np.asarray([0, 0, 0, 0, 1, 1, 1,
                                                1])).all())

        ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks), 1)),
                             targets=[1] * 8,
                             chunks=range(len(chunks)))
        coarsen_chunks(ds2, nchunks=2)
        self.failUnless((chunks1 == ds2.chunks).all())
Example #24
def test_mergeds2():
    """Test composition of new datasets by addition of existing ones
    """
    data = dataset_wizard([range(5)], targets=1, chunks=1)

    assert_array_equal(data.UT, [1])

    # simple sequence has to be a single pattern
    assert_equal(data.nsamples, 1)
    # check correct pattern layout (1x5)
    assert_array_equal(data.samples, [[0, 1, 2, 3, 4]])

    # check for single labels and origin
    assert_array_equal(data.targets, [1])
    assert_array_equal(data.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(DatasetError,
                  data.append,
                  dataset_wizard(np.ones((2,3)), targets=1, chunks=1))

    # now add two real patterns
    dss = datasets['uni2large'].samples
    data.append(dataset_wizard(dss[:2, :5], targets=2, chunks=2))
    assert_equal(data.nfeatures, 5)
    assert_array_equal(data.targets, [1, 2, 2])
    assert_array_equal(data.chunks, [1, 2, 2])

    # test automatic origins
    data.append(dataset_wizard(dss[3:5, :5], targets=3, chunks=[0, 1]))
    assert_array_equal(data.chunks, [1, 2, 2, 0, 1])

    # test unique class labels
    assert_array_equal(data.UT, [1, 2, 3])

    # test wrong label length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5],
                  targets=[1, 2, 3], chunks=2)

    # test wrong origin length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5],
                  targets=[1, 2, 3, 4], chunks=[2, 2, 2])
Example #25
    def test_invar_features_removal(self):
        r = np.random.normal(size=(3, 1))
        ds = dataset_wizard(samples=np.hstack((np.zeros((3, 2)), r)),
                            targets=1)

        self.failUnless(ds.nfeatures == 3)

        dsc = remove_invariant_features(ds)

        self.failUnless(dsc.nfeatures == 1)
        self.failUnless((dsc.samples == r).all())
Example #26
    def test_invar_features_removal(self):
        r = np.random.normal(size=(3,1))
        ds = dataset_wizard(samples=np.hstack((np.zeros((3,2)), r)),
                     targets=1)

        self.failUnless(ds.nfeatures == 3)

        dsc = remove_invariant_features(ds)

        self.failUnless(dsc.nfeatures == 1)
        self.failUnless((dsc.samples == r).all())
Example #27
def test_arrayattributes():
    samples = np.arange(12).reshape((4, 3))
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = dataset_wizard(samples, labels, chunks)

    for a in (ds.samples, ds.targets, ds.chunks):
        ok_(isinstance(a, np.ndarray))

    ds.targets = labels
    ok_(isinstance(ds.targets, np.ndarray))

    ds.chunks = chunks
    ok_(isinstance(ds.chunks, np.ndarray))
Example #28
def sin_modulated(n_instances, n_features, flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. The target label is the rounded sine of
    the summed squared features, plus uniform noise
    """
    if flat:
        data = np.arange(0.0, 1.0, 1.0 / n_instances) * np.pi
        data.resize(n_instances, n_features)
    else:
        data = np.random.rand(n_instances, n_features) * np.pi
    label = np.sin((data ** 2).sum(1)).round()
    label += np.random.rand(label.size) * noise
    return dataset_wizard(samples=data, targets=label)
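
A usage sketch; with flat=True the inputs form a deterministic grid, so only the uniform label noise varies between calls:

ds = sin_modulated(100, 4, flat=True)
assert ds.nsamples == 100 and ds.nfeatures == 4
# targets are round(sin(sum(x**2))) in {-1, 0, 1} plus uniform noise in [0, 0.4)
assert (ds.targets >= -1.0).all() and (ds.targets < 1.4).all()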
Example #29
def sin_modulated(n_instances, n_features,
                  flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. The target label is the rounded sine of
    the summed squared features, plus uniform noise
    """
    if flat:
        data = (np.arange(0.0, 1.0, 1.0/n_instances)*np.pi)
        data.resize(n_instances, n_features)
    else:
        data = np.random.rand(n_instances, n_features)*np.pi
    label = np.sin((data**2).sum(1)).round()
    label += np.random.rand(label.size)*noise
    return dataset_wizard(samples=data, targets=label)
Example #30
def test_combined_samplesfeature_selection():
    data = dataset_wizard(np.arange(20).reshape((4, 5)).view(myarray),
                   targets=[1,2,3,4],
                   chunks=[5,6,7,8])

    # array subclass survives
    ok_(isinstance(data.samples, myarray))

    ok_(data.nsamples == 4)
    ok_(data.nfeatures == 5)
    sel = data[[0, 3], [1, 2]]
    ok_(sel.nsamples == 2)
    ok_(sel.nfeatures == 2)
    assert_array_equal(sel.targets, [1, 4])
    assert_array_equal(sel.chunks, [5, 8])
    assert_array_equal(sel.samples, [[1, 2], [16, 17]])
    # array subclass survives
    ok_(isinstance(sel.samples, myarray))


    # should yield the same result if done sequentially
    sel2 = data[:, [1, 2]]
    sel2 = sel2[[0, 3]]
    assert_array_equal(sel.samples, sel2.samples)
    ok_(sel2.nsamples == 2)
    ok_(sel2.nfeatures == 2)
    # array subclass survives
    ok_(isinstance(sel.samples, myarray))


    assert_raises(ValueError, data.__getitem__, (1, 2, 3))

    # test correct behavior when selecting just single rows/columns
    single = data[0]
    ok_(single.nsamples == 1)
    ok_(single.nfeatures == 5)
    assert_array_equal(single.samples, [[0, 1, 2, 3, 4]])
    single = data[:, 0]
    ok_(single.nsamples == 4)
    ok_(single.nfeatures == 1)
    assert_array_equal(single.samples, [[0], [5], [10], [15]])
    single = data[1, 1]
    ok_(single.nsamples == 1)
    ok_(single.nfeatures == 1)
    assert_array_equal(single.samples, [[6]])
    # array subclass survives
    ok_(isinstance(single.samples, myarray))
Example #31
def test_samplesgroup_mapper():
    data = np.arange(24).reshape(8,3)
    labels = [0, 1] * 4
    chunks = np.repeat(np.array((0,1)),4)

    # correct results
    csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]]
    clabels = [0, 1, 0, 1]
    cchunks = [0, 0, 1, 1]

    ds = dataset_wizard(samples=data, targets=labels, chunks=chunks)
    # add some feature attribute -- just to check
    ds.fa['checker'] = np.arange(3)
    ds.init_origids('samples')

    m = mean_group_sample(['targets', 'chunks'])
    mds = m.forward(ds)
    assert_array_equal(mds.samples, csamples)
    # FAs should simply remain the same
    assert_array_equal(mds.fa.checker, np.arange(3))

    # now without grouping
    m = mean_sample()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples),
                       m.forward(ds).samples)

    # directly apply to dataset
    # using untrained mapper
    m = mean_group_sample(['targets', 'chunks'])
    mapped = ds.get_mapped(m)

    assert_equal(mapped.nsamples, 4)
    assert_equal(mapped.nfeatures, 3)
    assert_array_equal(mapped.samples, csamples)
    assert_array_equal(mapped.targets, clabels)
    assert_array_equal(mapped.chunks, cchunks)
    # make sure origids get regenerated
    assert_array_equal([s.count('+') for s in mapped.sa.origids], [1] * 4)

    # imbalanced dataset -- let's remove the 0th sample so there is no
    # target 0 in the 0th chunk
    ds_ = ds[[0, 1, 3, 5]]
    mapped = ds_.get_mapped(m)
    ok_(len(mapped) == 3)
    ok_(not None in mapped.sa.origids)
Example #33
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2, data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels
    """
    x = np.linspace(0, 1, n_instances)
    y = np.sin((10 * np.pi * x ** 2))

    data = np.random.normal(size=(n_instances, n_features)) * data_noise
    for i in xrange(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + np.random.normal(size=(n_instances,)) * noise

    return dataset_wizard(samples=data, targets=labels)
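
A sketch checking the advertised structure (the thresholds are loose, illustrative choices): the first n_nonbogus_features columns should correlate with the labels, the remaining ones should not:

import numpy as np

ds = chirp_linear(n_instances=500, n_features=4, n_nonbogus_features=2)
corr = [np.corrcoef(ds.samples[:, i], ds.targets)[0, 1] for i in range(4)]
# signal-bearing features correlate strongly, bogus ones only by chance
assert min(abs(corr[0]), abs(corr[1])) > 0.5
assert max(abs(corr[2]), abs(corr[3])) < 0.3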
Example #34
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2,
                data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels
    """
    x = np.linspace(0, 1, n_instances)
    y = np.sin((10 * np.pi * x **2))

    data = np.random.normal(size=(n_instances, n_features ))*data_noise
    for i in xrange(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + np.random.normal(size=(n_instances,))*noise

    return dataset_wizard(samples=data, targets=labels)
Example #35
    def test_classifier(self):
        clf = ParametrizedClassifier()
        self.failUnlessEqual(len(clf.params.items()), 3)  # + targets # retrainable
        self.failUnlessEqual(len(clf.kernel_params.items()), 1)

        clfe = ParametrizedClassifierExtended()
        self.failUnlessEqual(len(clfe.params.items()), 3)
        self.failUnlessEqual(len(clfe.kernel_params.items()), 2)
        self.failUnlessEqual(len(clfe.kernel_params.listing), 2)

        # check assignment once again
        self.failUnlessEqual(clfe.kernel_params.kp2, 200.0)
        clfe.kernel_params.kp2 = 201.0
        self.failUnlessEqual(clfe.kernel_params.kp2, 201.0)
        self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), True)
        clfe.train(dataset_wizard(samples=[[0, 0]], targets=[1], chunks=[1]))
        self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), False)
        self.failUnlessEqual(clfe.kernel_params.is_set(), False)
        self.failUnlessEqual(clfe.params.is_set(), False)
Example #36
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """Generate a dataset from a linear function with AWGN
    (Added White Gaussian Noise).

    It can be multidimensional if 'slope' is a vector. If flat is True
    (in 1 dimesion) generate equally spaces samples instead of random
    ones. This is useful for the test phase.
    """
    dimensions = 1
    if isinstance(slope, np.ndarray):
        dimensions = slope.size

    if flat and dimensions == 1:
        x = np.linspace(0, 1, size)[:, np.newaxis]
    else:
        x = np.random.rand(size, dimensions)

    y = np.dot(x, slope)[:, np.newaxis] + (np.random.randn(*(x.shape[0], 1)) * noise_std) + intercept

    return dataset_wizard(samples=x, targets=y)
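
A usage sketch for the multidimensional case. Note that with a vector slope the np.dot(...)[:, np.newaxis] line yields a clean (size, 1) target column, whereas with the default scalar slope it adds an extra axis, so the vector form is the safer entry point:

import numpy as np

ds = linear_awgn(size=100, slope=np.array([0.4, -0.2]), noise_std=0.01)
assert ds.nsamples == 100 and ds.nfeatures == 2
# targets follow x . slope + intercept up to ~1% noise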
Example #37
    def test_classifier(self):
        clf = ParametrizedClassifier()
        self.failUnlessEqual(len(clf.params.items()),
                             3)  # + targets # retrainable
        self.failUnlessEqual(len(clf.kernel_params.items()), 1)

        clfe = ParametrizedClassifierExtended()
        self.failUnlessEqual(len(clfe.params.items()), 3)
        self.failUnlessEqual(len(clfe.kernel_params.items()), 2)
        self.failUnlessEqual(len(clfe.kernel_params.listing), 2)

        # check assignment once again
        self.failUnlessEqual(clfe.kernel_params.kp2, 200.0)
        clfe.kernel_params.kp2 = 201.0
        self.failUnlessEqual(clfe.kernel_params.kp2, 201.0)
        self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), True)
        clfe.train(dataset_wizard(samples=[[0, 0]], targets=[1], chunks=[1]))
        self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), False)
        self.failUnlessEqual(clfe.kernel_params.is_set(), False)
        self.failUnlessEqual(clfe.params.is_set(), False)
Example #38
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """Generate a dataset from a linear function with AWGN
    (Added White Gaussian Noise).

    It can be multidimensional if 'slope' is a vector. If flat is True
    (in 1 dimesion) generate equally spaces samples instead of random
    ones. This is useful for the test phase.
    """
    dimensions = 1
    if isinstance(slope, np.ndarray):
        dimensions = slope.size

    if flat and dimensions == 1:
        x = np.linspace(0, 1, size)[:, np.newaxis]
    else:
        x = np.random.rand(size, dimensions)

    y = np.dot(x, slope)[:, np.newaxis] \
        + (np.random.randn(*(x.shape[0], 1)) * noise_std) + intercept

    return dataset_wizard(samples=x, targets=y)
Example #39
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD.  For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    intervals = np.array([[-1.932, -0.453], [0.534, 3.142]])
    r = np.array([2.0, 1.3])
    x = np.random.rand(size, 2)
    x *= np.array(intervals[:, 1]-intervals[:, 0])
    x += np.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d column Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    y = r[0]*np.cos(x[:, 0] + r[1]*np.cos(x.sum(1))) + \
        np.random.randn(size)*np.sqrt(0.0025)
    y -= y.mean()
    x34 = x + np.random.randn(size, 2)*0.02
    x56 = np.random.randn(size, 2)
    x = np.hstack([x, x34, x56])
    return dataset_wizard(samples=x, targets=y)
Example #41
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
            targets=1, chunks=1),
    ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        zsm.train(ds1)
        ds1z = zsm.forward(ds1.samples)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
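
For reference, both code paths compute the same per-feature standardization; a plain-numpy sketch of the transform with global estimates (the chunks_attr=None case):

import numpy as np

def manual_zscore(samples):
    """(x - mean) / std, estimated per feature over all samples."""
    samples = np.asarray(samples, dtype=float)
    return (samples - samples.mean(axis=0)) / samples.std(axis=0)

x = np.arange(40, dtype=float).reshape(40, 1).repeat(20, axis=1)
# should match ZScoreMapper(chunks_attr=None).forward(x) up to the std estimator
z = manual_zscore(x)
assert abs(z.mean()) < 1e-12 and abs(z.std(axis=0) - 1).max() < 1e-12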
Example #43
    def test_binary_decorator(self):
        ds = dataset_wizard(samples=[ [0,0], [0,1], [1,100], [-1,0], [-1,-3], [ 0,-10] ],
                     targets=[ 'sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
        testdata = [ [0,0], [10,10], [-10, -1], [0.1, -0.1], [-0.2, 0.2] ]
        # labels [s]ame/[d]ifferent (sign), and [p]ositive/[n]egative first element

        clf = SameSignClassifier()
        # let's create a classifier to discriminate only between
        # same/different, which is the primary task of SameSignClassifier
        bclf1 = BinaryClassifier(clf=clf,
                                 poslabels=['sp', 'sn'],
                                 neglabels=['dp', 'dn'])

        orig_labels = ds.targets[:]
        bclf1.train(ds)

        self.failUnless(bclf1.predict(testdata) ==
                        [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
                         ['dn', 'dp'], ['dn', 'dp']])

        self.failUnless((ds.targets == orig_labels).all(),
                        msg="BinaryClassifier should not alter labels")
Example #44
def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
    """Yet another generator of random dataset

    """
    # used in projection example
    x = []
    y = []
    labels = []
    for fx in sfx:
        nx = np.random.normal(size=size_per_fx)
        ny = fx(nx) + np.random.normal(size=nx.shape, scale=noise_std)
        x.append(nx)
        y.append(ny)

        # label is 1 whenever the point falls below the decision function dfx
        labels.append(np.array(ny < dfx(nx), dtype='int'))

    samples = np.array((np.hstack(x), np.hstack(y))).squeeze().T
    labels = np.hstack(labels).squeeze().T

    samples += np.array(center)

    return dataset_wizard(samples=samples, targets=labels)
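
A usage sketch with two simple generating functions (illustrative choices); labels encode whether a point fell below the decision function dfx:

import numpy as np

dfx = lambda x: x             # decision curve: y = x
sfx = [lambda x: x + 1.0,     # cloud above the curve -> mostly label 0
       lambda x: x - 1.0]     # cloud below the curve -> mostly label 1
ds = noisy_2d_fx(100, dfx, sfx, center=(0, 0), noise_std=0.3)
assert ds.nsamples == 200 and ds.nfeatures == 2
assert set(ds.targets) == set([0, 1])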
Example #46
def test_arrayattributes():
    samples = np.arange(12).reshape((4, 3))
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = dataset_wizard(samples, labels, chunks)

    for a in (ds.samples, ds.targets, ds.chunks):
        ok_(isinstance(a, np.ndarray))

    ds.targets = labels
    ok_(isinstance(ds.targets, np.ndarray))

    ds.chunks = chunks
    ok_(isinstance(ds.chunks, np.ndarray))

    # we should allow assigning somewhat more complex
    # iterables -- use ndarray of dtype object then
    # and possibly spit out a warning
    ds.sa['complex_list'] = [[], [1], [1, 2], []]
    ok_(ds.sa.complex_list.dtype == object)

    # but incorrect length should still fail
    assert_raises(ValueError, ds.sa.__setitem__,
                  'complex_list2', [[], [1], [1, 2]])
Example #47
    def test_binary_decorator(self):
        ds = dataset_wizard(samples=[[0, 0], [0, 1], [1, 100], [-1, 0],
                                     [-1, -3], [0, -10]],
                            targets=['sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
        testdata = [[0, 0], [10, 10], [-10, -1], [0.1, -0.1], [-0.2, 0.2]]
        # labels [s]ame/[d]ifferent (sign), and [p]ositive/[n]egative first element

        clf = SameSignClassifier()
        # let's create a classifier to discriminate only between
        # same/different, which is the primary task of SameSignClassifier
        bclf1 = BinaryClassifier(clf=clf,
                                 poslabels=['sp', 'sn'],
                                 neglabels=['dp', 'dn'])

        orig_labels = ds.targets[:]
        bclf1.train(ds)

        self.failUnless(bclf1.predict(testdata) ==
                        [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
                         ['dn', 'dp'], ['dn', 'dp']])

        self.failUnless((ds.targets == orig_labels).all(),
                        msg="BinaryClassifier should not alter labels")
Example #48
    def setUp(self):
        self.data = dataset_wizard(np.random.normal(size=(100, 10)),
                                   targets=[i % 4 for i in range(100)],
                                   chunks=[i / 10 for i in range(100)])
    bdot = base + '.'

    tcomp = [bdot + a for a in attrs]
    return tcomp


def activate():
    """Activate the PyMVPA Collections completer.
    """
    ipget().set_hook('complete_command', pymvpa_completer, re_key='.*')


#############################################################################
if __name__ == '__main__':
    # Testing/debugging, can be done only under interactive IPython session
    from mvpa.datasets.base import dataset_wizard
    t = dataset_wizard([1, 2, 3], targets=1, chunks=2)

    ip = ipget().IP

    assert (not 'targets' in ip.complete('t.sa.'))
    assert (not 'chunks' in ip.complete('t.sa.'))

    from ipy_pymvpa_completer import activate
    activate()
    # A few simplistic tests
    assert ip.complete('t.ed') == []
    assert ('targets' in ip.complete('t.sa.'))
    assert ('chunks' in ip.complete('t.sa.'))
    print 'Tests OK'
Example #50
    def setUp(self):
        self.data = dataset_wizard(np.random.normal(size=(100, 10)),
                                   targets=[i % 4 for i in range(100)],
                                   chunks=[i / 10 for i in range(100)])
Example #51
First import the necessary pieces of PyMVPA -- this time each bit individually.
"""
from mvpa.datasets.base import dataset_wizard
from mvpa.datasets.splitters import OddEvenSplitter
from mvpa.clfs.svm import LinearCSVMC
from mvpa.clfs.transerror import TransferError
from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
from mvpa.measures.searchlight import Searchlight
from mvpa.misc.data_generators import normal_feature_dataset

"""For the sake of simplicity, let's use a small artificial dataset."""

# overcomplicated way to generate an example dataset
ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=2, nfeatures=10,
                            nonbogus_features=[3, 7], snr=5.0)
dataset = dataset_wizard(samples=ds.samples, targets=ds.targets, chunks=ds.chunks)

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(TransferError(LinearCSVMC()), OddEvenSplitter())

# setup searchlight with 5 mm radius and measure configured above
sl = Searchlight(cv, radius=5)

# run searchlight on dataset
sl_map = sl(dataset)

print "Best performing sphere error:", min(sl_map)
Example #52
    def test_simple(self, oblique):
        d_orig = datasets['uni2large'].samples
        d_orig2 = datasets['uni4large'].samples
        for sdim, nf_s, nf_t, full_test \
                in (('Same 2D',  2,  2,  True),
                    ('Same 10D', 10, 10, True),
                    ('2D -> 3D', 2,  3,  True),
                    ('3D -> 2D', 3,  2,  False)):
            # figure out some "random" rotation
            d = max(nf_s, nf_t)
            R = get_random_rotation(nf_s, nf_t, d_orig)
            if nf_s == nf_t:
                adR = np.abs(1.0 - np.linalg.det(R))
                self.failUnless(adR < 1e-10,
                                "Determinant of rotation matrix should "
                                "be 1. Got it 1+%g" % adR)
                self.failUnless(norm(np.dot(R, R.T)
                                     - np.eye(R.shape[0])) < 1e-10)

            for s, scaling in ((0.3, True), (1.0, False)):
                pm = ProcrusteanMapper(scaling=scaling, oblique=oblique)
                pm2 = ProcrusteanMapper(scaling=scaling, oblique=oblique)

                t1, t2 = d_orig[23, 1], d_orig[22, 1]

                # Create source/target data
                d = d_orig[:, :nf_s]
                d_s = d + t1
                d_t = np.dot(s * d, R) + t2

                # train bloody mapper(s)
                ds = dataset_wizard(samples=d_s, targets=d_t)
                pm.train(ds)
                ## not possible with new interface
                #pm2.train(d_s, d_t)

                ## verify that both created the same transformation
                #npm2proj = norm(pm.proj - pm2.proj)
                #self.failUnless(npm2proj <= 1e-10,
                #                msg="Got transformation different by norm %g."
                #                " Had to be less than 1e-10" % npm2proj)
                #self.failUnless(norm(pm._offset_in - pm2._offset_in) <= 1e-10)
                #self.failUnless(norm(pm._offset_out - pm2._offset_out) <= 1e-10)

                # do forward transformation on the same source data
                d_s_f = pm.forward(d_s)

                self.failUnlessEqual(d_s_f.shape, d_t.shape,
                    msg="Mapped shape should be identical to the d_t")

                dsf = d_s_f - d_t
                ndsf = norm(dsf)/norm(d_t)
                if full_test:
                    dsR = norm(s*R - pm.proj)

                    if not oblique:
                        self.failUnless(dsR <= 1e-12,
                            msg="We should have got reconstructed rotation+scaling "
                                "perfectly. Now got d scale*R=%g" % dsR)

                        self.failUnless(np.abs(s - pm._scale) < 1e-12,
                            msg="We should have got reconstructed scale "
                                "perfectly. Now got %g for %g" % (pm._scale, s))

                    self.failUnless(ndsf <= 1e-12,
                      msg="%s: Failed to get to the target space correctly."
                        " normed error=%g" % (sdim, ndsf))

                # Test if we get back
                d_s_f_r = pm.reverse(d_s_f)

                dsfr = d_s_f_r - d_s
                ndsfr = norm(dsfr)/norm(d_s)
                if full_test:
                    self.failUnless(ndsfr <= 1e-12,
                      msg="%s: Failed to reconstruct into source space correctly."
                        " normed error=%g" % (sdim, ndsfr))
Example #53
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)
    # we should be able to operate on ndarrays
    # But we can't change the dtype in-place for an array, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                    dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                   targets=[0, 2, 2, 2, 1] + [2] * 11,
                   chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                   targets=[0, 2, 2, 2, 1] + [2] * 11,
                   chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing

    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3,1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1 )
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2,1), 1: (12,1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
Example #54
    def test_glm(self):
        """Test GLM
        """
        # play fmri
        # full-blown HRF with initial dip and undershoot ;-)
        hrf_x = np.linspace(0, 25, 250)
        hrf = double_gamma_hrf(hrf_x) - single_gamma_hrf(hrf_x, 0.8, 1, 0.05)

        # come up with an experimental design
        samples = 1800
        fast_er_onsets = np.array([10, 200, 250, 500, 600, 900, 920, 1400])
        fast_er = np.zeros(samples)
        fast_er[fast_er_onsets] = 1

        # high resolution model of the convolved regressor
        model_hr = np.convolve(fast_er, hrf)[:samples]

        # downsample the regressor to fMRI resolution
        tr = 2.0
        model_lr = signal.resample(model_hr,
                                   int(samples / tr / 10),
                                   window='ham')

        # generate artificial fMRI data: two voxels, one is noise, one has
        # something
        baseline = 800.0
        wsignal = baseline + 2 * model_lr + \
                  np.random.randn(int(samples / tr / 10)) * 0.2
        nsignal = baseline + np.random.randn(int(samples / tr / 10)) * 0.5

        # build design matrix: bold-regressor and constant
        X = np.array([model_lr, np.repeat(1, len(model_lr))]).T

        # two 'voxel' dataset
        data = dataset_wizard(samples=np.array((wsignal, nsignal, nsignal)).T, targets=1)

        # check GLM betas
        glm = GLM(X)
        betas = glm(data)

        # betas for each feature and each regressor
        self.failUnless(betas.shape == (X.shape[1], data.nfeatures))

        self.failUnless((np.absolute(betas.samples[1] - baseline) < 10).all(),
            msg="baseline betas should be huge and around 800")

        self.failUnless(betas.samples[0, 0] > betas.samples[0, 1],
            msg="feature (with signal) beta should be larger than for noise")

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(np.absolute(betas[0,1]) < 0.5)
            self.failUnless(np.absolute(betas[0,0]) > 1.0)


        # check GLM zscores
        glm = GLM(X, voi='zstat')
        zstats = glm(data)

        self.failUnless(zstats.shape == betas.shape)

        self.failUnless((zstats.samples[1] > 1000).all(),
                msg='constant zstats should be huge')

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(np.absolute(zstats.samples[0, 0]) >
                            np.absolute(zstats.samples[0, 1]),
                msg='with signal should have higher zstats')
Example #55
def normal_feature_dataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=3.0,
                         normalize=True):
    """Generate a univariate dataset with normal noise and specified means.

    Could be considered to be a generalization of
    `pure_multivariate_signal` where means=[ [0,1], [1,0] ].

    Specify either means or `nonbogus_features` so means get assigned
    accordingly.  If neither `means` nor `nonbogus_features` are
    provided, data will be pure noise and no per-label information.

    Parameters
    ----------
    perlabel : int, optional
      Number of samples per each label
    nlabels : int, optional
      Number of labels in the dataset
    nfeatures : int, optional
      Total number of features (including bogus features which carry
      no label-related signal)
    nchunks : int, optional
      Number of chunks (perlabel should be multiple of nchunks)
    means : None or ndarray of (nlabels, nfeatures) shape
      Specified means for each of features (columns) for all labels (rows).
    nonbogus_features : None or list of int
      Indexes of non-bogus features (1 per label).
    snr : float, optional
      Signal-to-noise ratio; assuming the signal has std 1.0, we
      just divide the random normal noise by snr
    normalize : bool, optional
      Divide by max(abs()) value to bring data into [-1, 1] range.
    """

    data = np.random.standard_normal((perlabel*nlabels, nfeatures))/np.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't assign simply a feature to a " + \
                  "class: more nonbogus_features than labels"
        means = np.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add mean
        data += np.repeat(np.array(means, ndmin=2), perlabel, axis=0)
    if normalize:
        # bring it 'under 1', since otherwise some classifiers have difficulties
        # during optimization
        data = 1.0/(np.max(np.abs(data))) * data
    labels = np.concatenate([np.repeat('L%d' % i, perlabel)
                                for i in range(nlabels)])
    chunks = np.concatenate([np.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = dataset_wizard(data, targets=labels, chunks=chunks)

    # If nonbogus was provided -- assign .a and .fa accordingly
    if nonbogus_features is not None:
        ds.fa['targets'] = np.array([None]*nfeatures)
        ds.fa.targets[nonbogus_features] = ['L%d' % i for i in range(nlabels)]
        ds.a['nonbogus_features'] = nonbogus_features
        ds.a['bogus_features'] = [x for x in range(nfeatures)
                                  if not x in nonbogus_features]


    return ds
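
A usage sketch matching the docstring: one informative feature per label, the rest bogus; the .a attributes assigned above make the split recoverable:

ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=5,
                            nchunks=4, nonbogus_features=[0, 3], snr=5.0)
assert ds.nsamples == 40 and ds.nfeatures == 5
assert list(ds.a.nonbogus_features) == [0, 3]
assert sorted(ds.a.bogus_features) == [1, 2, 4]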
Example #56
def normal_feature_dataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=3.0,
                         normalize=True):
    """Generate a univariate dataset with normal noise and specified means.

    Could be considered to be a generalization of
    `pure_multivariate_signal` where means=[ [0,1], [1,0] ].

    Specify either means or `nonbogus_features` so means get assigned
    accordingly.  If neither `means` nor `nonbogus_features` are
    provided, data will be pure noise and no per-label information.

    Parameters
    ----------
    perlabel : int, optional
      Number of samples per each label
    nlabels : int, optional
      Number of labels in the dataset
    nfeatures : int, optional
      Total number of features (including bogus features which carry
      no label-related signal)
    nchunks : int, optional
      Number of chunks (perlabel should be multiple of nchunks)
    means : None or ndarray of (nlabels, nfeatures) shape
      Specified means for each of features (columns) for all labels (rows).
    nonbogus_features : None or list of int
      Indexes of non-bogus features (1 per label).
    snr : float, optional
      Signal-to-noise ratio; assuming the signal has std 1.0, we
      just divide the random normal noise by snr
    normalize : bool, optional
      Divide by max(abs()) value to bring data into [-1, 1] range.
    """

    data = np.random.standard_normal((perlabel*nlabels, nfeatures))/np.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't assign simply a feature to a " + \
                  "class: more nonbogus_features than labels"
        means = np.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add mean
        data += np.repeat(np.array(means, ndmin=2), perlabel, axis=0)
    if normalize:
        # bring it 'under 1', since otherwise some classifiers have difficulties
        # during optimization
        data = 1.0/(np.max(np.abs(data))) * data
    labels = np.concatenate([np.repeat('L%d' % i, perlabel)
                                for i in range(nlabels)])
    chunks = np.concatenate([np.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = dataset_wizard(data, targets=labels, chunks=chunks)

    # If nonbogus was provided -- assign .a and .fa accordingly
    if nonbogus_features is not None:
        ds.fa['nonbogus_targets'] = np.array([None]*nfeatures)
        ds.fa.nonbogus_targets[nonbogus_features] = ['L%d' % i for i in range(nlabels)]
        ds.a['nonbogus_features'] = nonbogus_features
        ds.a['bogus_features'] = [x for x in range(nfeatures)
                                  if not x in nonbogus_features]


    return ds