Example #1
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks':   [ 0 ,  1 ,  2 ,  3 ,  4,   5 ],
                     'targets':  ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to ensure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but only care
    # about those partitions where the number of 'c' and 'p' entries
    # is balanced
    assert_raises(ValueError,
                  lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
                  ds)

    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks'),
                     Sifter([('partitions', 2),
                             ('targets',
                              dict(uvalues=['c', 'p'],
                                   balanced=True))])
                     ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
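The surviving count of 9 follows from a counting argument: NFoldPartitioner(cvtype=4) places 4 of the 6 chunks into the testing partition, and the Sifter only keeps selections with two 'c' chunks and two 'p' chunks, i.e. C(3,2) * C(3,2) = 9 of the C(6,4) = 15 candidates. A small standalone sketch (plain itertools/collections, independent of PyMVPA) reproducing both counts:

from itertools import combinations
from collections import Counter

chunks = [0, 1, 2, 3, 4, 5]
targets = ['c', 'c', 'c', 'p', 'p', 'p']
target_of = dict(zip(chunks, targets))

testing_sets = list(combinations(chunks, 4))
assert len(testing_sets) == 15          # all leave-4-chunks-out selections

balanced = [s for s in testing_sets
            if Counter(target_of[c] for c in s) == Counter(c=2, p=2)]
assert len(balanced) == 9               # only the balanced selections survive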
Example #2
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={
                     'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']
                 })

    # Without sifter -- just to ensure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but only care
    # about those partitions where the number of 'c' and 'p' entries
    # is balanced
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)), ds)

    par = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2),
                ('targets', dict(uvalues=['c', 'p'], balanced=True))])
    ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
Example #3
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
            AttributePermutator(['superord'], limit=['partitions', 'chunks']),
        ],
        space='partitions')

    for ds_perm in part.generate(ds):
        # check that the permutation actually changed something
        assert (np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
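For intuition about limit=['partitions', 'chunks']: as I read AttributePermutator, the 'superord' labels are shuffled independently within each group of samples sharing the same (partitions, chunks) combination, so labels never cross partition or chunk boundaries. A PyMVPA-free numpy sketch of that kind of within-group shuffling:

import numpy as np

rng = np.random.default_rng(0)
partitions = np.array([1, 1, 2, 2, 1, 1, 2, 2])
chunks     = np.array([0, 0, 0, 0, 1, 1, 1, 1])
superord   = np.array(['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'])

permuted = superord.copy()
# shuffle labels only inside each (partition, chunk) group
for part, chunk in set(zip(partitions, chunks)):
    mask = (partitions == part) & (chunks == chunk)
    permuted[mask] = rng.permutation(permuted[mask])

# group-wise label counts are preserved even though the order may change
for part, chunk in set(zip(partitions, chunks)):
    mask = (partitions == part) & (chunks == chunk)
    assert sorted(permuted[mask]) == sorted(superord[mask])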
Example #4
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 { 'uvalues': ds.sa['superord'].unique,
                   'balanced': True})]),
        AttributePermutator(['superord'], limit=['partitions',
                                                 'chunks']),
    ], space='partitions')

    for ds_perm in part.generate(ds):
        # check that the permutation actually changed something
        assert (np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
Example #5
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([NFoldPartitioner(attr='subjects'),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=1,
                                 targets_attr='chunks',
                                 space='partitions')],
                            space='partitions')
    # targets do not even need to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so this is an implicit test:
        # if not, it would be an error
        testing_subj = np.asscalar(np.unique(ds_.sa.subjects[testing_partition]))
        testing_subjs.append(testing_subj)
        testing_chunk = np.asscalar(np.unique(ds_.sa.chunks[testing_partition]))
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(not testing_subj in ds_.sa.subjects[training_partition])
        ok_(not testing_chunk in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3)))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3,3))))))
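The closing assertion's np.where(np.ones((3, 3))) idiom was only a Python 2.5 workaround; it enumerates exactly the nine (subject, chunk) pairs that itertools.product would give, as a quick standalone check shows:

import itertools
import numpy as np

pairs_np = set(zip(*np.where(np.ones((3, 3)))))
pairs_it = set(itertools.product(range(3), range(3)))
assert pairs_np == pairs_it   # both enumerate all 9 (subject, chunk) pairs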
Example #6
def test_exclude_targets_combinations():
    partitioner = ChainNode([
        NFoldPartitioner(),
        ExcludeTargetsCombinationsPartitioner(
            k=2, targets_attr='targets', space='partitions')
    ], space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)),
                 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
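The expected 3 * 6 partitions are plain arithmetic: the NFoldPartitioner over the 3 chunks contributes a factor of 3, and excluding k=2 of the 4 target labels at a time contributes C(4, 2) = 6, which a two-line check confirms:

from itertools import combinations

# 6 ways to pick which 2 of the 4 target labels are held out together
assert len(list(combinations(range(4), 2))) == 6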
Example #7
    def test_split_clf_on_chainpartitioner(self):
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        part = ChainNode([
            NFoldPartitioner(cvtype=1),
            Balancer(attr='targets',
                     count=2,
                     limit='partitions',
                     apply_selection=True)
        ])
        partitions = list(part.generate(ds))
        sclf = SplitClassifier(sample_clf_lin,
                               part,
                               enable_ca=['stats', 'splits'])
        sclf.train(ds)
        pred = sclf.predict(ds)
        assert_equal(len(pred), len(ds))  # rudimentary check
        assert_equal(len(sclf.ca.splits), len(partitions))
        assert_equal(len(sclf.clfs), len(partitions))

        # now let's do sensitivity analyzer just in case
        sclf.untrain()
        sensana = sclf.get_sensitivity_analyzer()
        sens = sensana(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
        assert_true(np.any(sens_stds != 0))
Example #8
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets',
                    attr_values=[0, 1, 1, 2, 3, 3, 3],
                    count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits x 2 roi splits x 10 chunk splits = 80 in total
    assert_equal(len(splits), 80)
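The split.samples.base checks rest on a numpy detail: plain slicing returns a view whose .base is the sliced array, while fancy indexing (which noslicing=True appears to force here) returns an independent copy. A minimal PyMVPA-free illustration:

import numpy as np

a = np.zeros((10, 2))

view = a[2:5]          # basic slicing -> a view; .base points at the original
assert view.base is a

copy = a[[2, 3, 4]]    # fancy indexing -> an independent copy with no base
assert copy.base is None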
Example #9
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3], count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0,1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits x 2 roi splits x 10 chunk splits = 80 in total
    assert_equal(len(splits), 80)
Example #10
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4,2)),
                 sa={'chunks':   [ 0 ,  1 ,  2 ,  3 ],
                     'targets':  ['c', 'c', 'p', 'p']})
    par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                     Sifter([('partitions', 2),
                             ('targets', ['c', 'p'])])
                     ])
    dss = list(par.generate(ds))
    assert_equal(len(dss), 4)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
Example #11
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4, 2)), sa={"chunks": [0, 1, 2, 3], "targets": ["c", "c", "p", "p"]})
    for sift_targets_definition in (["c", "p"], dict(uvalues=["c", "p"])):
        par = ChainNode(
            [
                NFoldPartitioner(cvtype=2, attr="chunks"),
                Sifter([("partitions", 2), ("targets", sift_targets_definition)]),
            ]
        )
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ["c", "p"])
            # and we still have both targets present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ["c", "p"])
Example #12
    def test_discarded_boundaries(self):
        ds = datasets["hollow"]
        # four runs
        ds.sa["chunks"] = np.repeat(np.arange(4), 10)
        # do odd even splitting for lots of boundaries in few splits
        part = ChainNode([OddEvenPartitioner(), StripBoundariesSamples("chunks", 1, 2)])

        parts = [d.samples.sid for d in part.generate(ds)]

        # both datasets should have the same samples, because the boundaries are
        # identical and the same samples should be stripped
        assert_array_equal(parts[0], parts[1])

        # we strip 3 samples per boundary
        assert_equal(len(parts[0]), len(ds) - (3 * 3))

        for i in [9, 10, 11, 19, 20, 21, 29, 30, 31]:
            assert_false(i in parts[0])
Example #13
    def test_discarded_boundaries(self):
        ds = datasets['hollow']
        # four runs
        ds.sa['chunks'] = np.repeat(np.arange(4), 10)
        # do odd even splitting for lots of boundaries in few splits
        part = ChainNode([OddEvenPartitioner(),
                          StripBoundariesSamples('chunks', 1, 2)])

        parts = [d.samples.sid for d in part.generate(ds)]

        # both datasets should have the same samples, because the boundaries are
        # identical and the same samples should be stripped
        assert_array_equal(parts[0], parts[1])

        # we strip 3 samples per boundary
        assert_equal(len(parts[0]), len(ds) - (3 * 3))

        for i in [9, 10, 11, 19, 20, 21, 29, 30, 31]:
            assert_false(i in parts[0])
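The excluded indices follow from the setup: four chunks of 10 samples have internal transitions at indices 10, 20 and 30, and StripBoundariesSamples('chunks', 1, 2) removes three samples around each of them (one before the transition and two from its start, going by the arguments and the assertions above). A tiny arithmetic check of the index set:

# samples stripped around each chunk transition, assuming offsets -1, 0, +1
stripped = [b + off for b in (10, 20, 30) for off in (-1, 0, 1)]
assert stripped == [9, 10, 11, 19, 20, 21, 29, 30, 31]
assert len(stripped) == 3 * 3   # 3 samples per boundary, 3 boundaries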
Example #14
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4,2)),
                 sa={'chunks':   [ 0 ,  1 ,  2 ,  3 ],
                     'targets':  ['c', 'c', 'p', 'p']})
    for sift_targets_definition in (['c', 'p'],
                                    dict(uvalues=['c', 'p'])):
        par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                         Sifter([('partitions', 2),
                                 ('targets', sift_targets_definition)])
                         ])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
            # and we still have both targets present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
Example #15
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(), ExcludeTargetsCombinationsPartitioner(k=2, targets_attr="targets", space="partitions")],
        space="partitions",
    )
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(snr=0.0, nlabels=4, perlabel=3, nchunks=3, nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter("partitions")
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
Example #16
File: test_clf.py Project: Anhmike/PyMVPA
    def test_split_clf_on_chainpartitioner(self):
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        part = ChainNode([NFoldPartitioner(cvtype=1),
                          Balancer(attr='targets', count=2,
                                   limit='partitions', apply_selection=True)])
        partitions = list(part.generate(ds))
        sclf = SplitClassifier(sample_clf_lin, part, enable_ca=['stats', 'splits'])
        sclf.train(ds)
        pred = sclf.predict(ds)
        assert_equal(len(pred), len(ds))  # rudimentary check
        assert_equal(len(sclf.ca.splits), len(partitions))
        assert_equal(len(sclf.clfs), len(partitions))

        # now let's do sensitivity analyzer just in case
        sclf.untrain()
        sensana = sclf.get_sensitivity_analyzer()
        sens = sensana(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
        assert_true(np.any(sens_stds != 0))
Example #17
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_1super)
    ]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [
            p.sa.partitions for p in factpart.generate(ds_unbalanced)
        ]

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_dummy)
    ]
    assert_array_equal(
        partitions_factpart,
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
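The four expected partitionings for ds_dummy are all the ways of holding out one subordinate per superordinate group: superord 1 covers samples {0, 2} and superord 2 covers samples {1, 3}, so the testing set is one pick from each, 2 * 2 = 4 combinations. A short sketch rebuilding the asserted matrix from that rule:

from itertools import product

# superord 1 -> samples {0, 2}, superord 2 -> samples {1, 3}
expected = [[2 if i in held_out else 1 for i in range(4)]
            for held_out in product([0, 2], [1, 3])]
assert expected == [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]]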
Example #18
File: base.py Project: reka-daniel/PyMVPA
def plot_feature_hist(dataset,
                      xlim=None,
                      noticks=True,
                      targets_attr='targets',
                      chunks_attr=None,
                      **kwargs):
    """Plot histograms of feature values for each labels.

    Parameters
    ----------
    dataset : Dataset
    xlim : None or 2-tuple
      Common x-axis limits for all histograms.
    noticks : bool
      If True, no axis ticks will be plotted. This is useful to save
      space in large plots.
    targets_attr : string, optional
      Name of samples attribute to be used as targets
    chunks_attr : None or string
      If a string, a histogram will be plotted for each target and each
      chunk (as defined in the sa named `chunks_attr`), resulting in a
      histogram grid (targets x chunks).
    **kwargs
      Any additional arguments are passed to matplotlib's hist().
    """
    lsplit = ChainNode([
        NFoldPartitioner(1, attr=targets_attr),
        Splitter('partitions', attr_values=[2])
    ])
    csplit = ChainNode([
        NFoldPartitioner(1, attr=chunks_attr),
        Splitter('partitions', attr_values=[2])
    ])

    nrows = len(dataset.sa[targets_attr].unique)
    ncols = len(dataset.sa[chunks_attr].unique)

    def doplot(data):
        """Just a little helper which plots the histogram and removes
        ticks etc"""

        pl.hist(data, **kwargs)

        if xlim is not None:
            pl.xlim(xlim)

        if noticks:
            pl.yticks([])
            pl.xticks([])

    fig = 1

    # for all labels
    for row, ds in enumerate(lsplit.generate(dataset)):
        if chunks_attr:
            for col, d in enumerate(csplit.generate(ds)):

                pl.subplot(nrows, ncols, fig)
                doplot(d.samples.ravel())

                if row == 0:
                    pl.title('C:' + str(d.sa[chunks_attr].unique[0]))
                if col == 0:
                    pl.ylabel('L:' + str(d.sa[targets_attr].unique[0]))

                fig += 1
        else:
            pl.subplot(1, nrows, fig)
            doplot(ds.samples)

            pl.title('L:' + str(ds.sa[targets_attr].unique[0]))

            fig += 1
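A hedged usage sketch for the function above (the dataset construction is illustrative and reuses normal_feature_dataset as elsewhere in these examples; it is not part of the original source):

# hypothetical usage -- assumes plot_feature_hist from above is in scope and
# an interactive matplotlib backend is available
import pylab as pl
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=20, nlabels=2, nchunks=4, nfeatures=10)
# one histogram per (target, chunk) cell; extra kwargs go straight to hist()
plot_feature_hist(ds, xlim=(-3, 3), chunks_attr='chunks', bins=10)
pl.show()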
Example #19
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6, snr=100, perlabel=30, nfeatures=6, nonbogus_features=range(6), nchunks=5  # snr=100: pure signal! ;)
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    ds.sa["superord"] = ["super%d" % (int(i[1]) % 3,) for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1" for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    # ds_unbalanced = ds.copy()
    # nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    # mask_superord = ds_unbalanced.sa.superord == 'super1'
    # uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    # ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]})

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa["superord"].unique), attr="subord"),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([("partitions", 2), ("superord", {"uvalues": ds.sa["superord"].unique, "balanced": True})]),
        ],
        space="partitions",
    )

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"), attr="superord")

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr="subord")
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = (
        "One or more superordinate attributes do not have the same "
        "number of subordinate attributes. This could yield to "
        "unbalanced partitions."
    )
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_unbalanced)]

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in zip(
        partitions_factpart, partitions_unbalanced, superord_unbalanced, subord_unbalanced
    ):
        assert_array_equal(out_part, true_part)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.superord.tolist(), ds_unbalanced[out_part == 2].sa.superord.tolist()),
            super_out,
        )
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.subord.tolist(), ds_unbalanced[out_part == 2].sa.subord.tolist()), sub_out
        )

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={"subord": range(4), "superord": [1, 2] * 2})
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_dummy)]
    assert_array_equal(partitions_factpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Example #20
File: base.py Project: arnaudsj/PyMVPA
def plot_feature_hist(dataset, xlim=None, noticks=True,
                      targets_attr='targets', chunks_attr=None,
                      **kwargs):
    """Plot histograms of feature values for each labels.

    Parameters
    ----------
    dataset : Dataset
    xlim : None or 2-tuple
      Common x-axis limits for all histograms.
    noticks : bool
      If True, no axis ticks will be plotted. This is useful to save
      space in large plots.
    targets_attr : string, optional
      Name of samples attribute to be used as targets
    chunks_attr : None or string
      If a string, a histogram will be plotted for each target and each
      chunk (as defined in the sa named `chunks_attr`), resulting in a
      histogram grid (targets x chunks).
    **kwargs
      Any additional arguments are passed to matplotlib's hist().
    """
    lsplit = ChainNode([NFoldPartitioner(1, attr=targets_attr),
                        Splitter('partitions', attr_values=[2])])
    csplit = ChainNode([NFoldPartitioner(1, attr=chunks_attr),
                        Splitter('partitions', attr_values=[2])])

    nrows = len(dataset.sa[targets_attr].unique)
    ncols = len(dataset.sa[chunks_attr].unique)

    def doplot(data):
        """Just a little helper which plots the histogram and removes
        ticks etc"""

        pl.hist(data, **kwargs)

        if xlim is not None:
            pl.xlim(xlim)

        if noticks:
            pl.yticks([])
            pl.xticks([])

    fig = 1

    # for all labels
    for row, ds in enumerate(lsplit.generate(dataset)):
        if chunks_attr:
            for col, d in enumerate(csplit.generate(ds)):

                pl.subplot(nrows, ncols, fig)
                doplot(d.samples.ravel())

                if row == 0:
                    pl.title('C:' + str(d.sa[chunks_attr].unique[0]))
                if col == 0:
                    pl.ylabel('L:' + str(d.sa[targets_attr].unique[0]))

                fig += 1
        else:
            pl.subplot(1, nrows, fig)
            doplot(ds.samples)

            pl.title('L:' + str(ds.sa[targets_attr].unique[0]))

            fig += 1