def test_simplest_cv_pat_gen(self):
    # create the generator
    nfs = NFoldPartitioner(cvtype=1)
    spl = Splitter(attr='partitions')
    # now get the xval pattern sets (one-fold CV)
    xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

    self.failUnless(len(xvpat) == 10)

    for i, p in enumerate(xvpat):
        self.failUnless(len(p) == 2)
        self.failUnless(p[0].nsamples == 90)
        self.failUnless(p[1].nsamples == 10)
        self.failUnless(p[1].chunks[0] == i)
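# A minimal sketch (illustrative only, not used by the tests) of the
# partitioner/splitter chaining exercised above: NFoldPartitioner marks each
# sample with a 'partitions' attribute (1 for training, 2 for testing by
# default), and Splitter then materializes one dataset per partition value.
# `ds` is assumed to be any dataset with a 'chunks' sample attribute, like
# self.data used throughout these tests.
def _example_train_test_pairs(ds):
    """Yield (training, testing) dataset pairs for leave-one-chunk-out CV."""
    spl = Splitter(attr='partitions')
    for partitioned in NFoldPartitioner(cvtype=1).generate(ds):
        # each generated dataset carries sa['partitions'], which the
        # splitter uses to produce the (training, testing) pair
        yield tuple(spl.generate(partitioned))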
def test_counted_splitting(self):
    spl = Splitter(attr='partitions')
    # count > #chunks should still result in only nchunks (10) splits
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [(nchunks * 2, nchunks),
                              (nchunks, nchunks),
                              (nchunks - 1, nchunks - 1),
                              (3, 3),
                              (0, 0),
                              (1, 1)]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
            self.failUnless(len(splits) == target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.failUnlessEqual(nsplits_cfg, target)

            # Check if the "lastpartitionset" dataset attribute was
            # assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.failUnless(ds_.a.lastpartitionset)
                # test all now
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.failUnlessEqual(ds_.a.lastpartitionset,
                                             isplit == nsplits - 1)

            # Check results of different strategies
            if strategy == 'first':
                self.failUnlessEqual(chosenchunks, range(target))
            elif strategy == 'equidistant':
                if target == 3:
                    self.failUnlessEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.failUnless(len(set(chosenchunks)) == len(chosenchunks))
                self.failUnless(target == len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s" % strategy)
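# For reference (an illustrative note, not executed by the tests): with
# count=N, selection_strategy='first' simply keeps the first N fold
# specifications, 'random' draws N distinct folds, and 'equidistant' spreads
# the N chosen folds evenly across all available ones -- which is what the
# [0, 3, 7] expectation for 3 out of 10 chunks above reflects.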
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shares the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is self.data.samples)
        assert_true(s[1].samples.base.base is self.data.samples)
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # no slicing at all -- both parts are copies
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # the training part is a slice only for the first and last splits
        if i == 0 or i == len(splits) - 1:
            assert_true(s[0].samples.base.base is self.data.samples)
        else:
            assert_true(s[0].samples.base is None)
        # the testing part is sliced all the time
        assert_true(s[1].samples.base.base is self.data.samples)
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shares the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is step_ds.samples)
        assert_true(s[1].samples.base.base is step_ds.samples)
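# A small helper sketch (not part of the test suite's API) making explicit the
# view-vs-copy check the slicing assertions above rely on: when a split is
# obtained by slicing, its samples array keeps a numpy .base chain leading
# back to the original array; a genuine copy does not.
def _is_view_of(samples, original):
    """Return True if `samples` is a (possibly chained) numpy view of `original`."""
    base = samples.base
    while base is not None:
        if base is original:
            return True
        base = getattr(base, 'base', None)
    return False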
def test_analyzer_with_split_classifier(self, clfds):
    """Test analyzers in split classifier"""
    clf, ds = clfds             # unroll the tuple
    # We need to skip some LARSes here
    _sclf = str(clf)
    if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
        # ADD KnownToFail thingie from NiPy
        return

    # To avoid wasting too much time testing, let's limit to 3 splits
    nsplits = 3
    partitioner = NFoldPartitioner(count=nsplits)
    mclf = SplitClassifier(clf=clf,
                           partitioner=partitioner,
                           enable_ca=['training_stats', 'stats'])
    sana = mclf.get_sensitivity_analyzer(# postproc=absolute_features(),
                                         enable_ca=["sensitivities"])

    ulabels = ds.uniquetargets
    nlabels = len(ulabels)
    # Can't rely on splitcfg since count-limit is done in __call__
    assert(nsplits == len(list(partitioner.generate(ds))))
    sens = sana(ds)

    # It should return either ...
    #  nlabels * nsplits
    req_nsamples = [nlabels * nsplits]
    if nlabels == 2:
        # a single sensitivity in case of binary
        req_nsamples += [nsplits]
    else:
        # and for pairs in case of multiclass
        req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
        # and for 1-vs-1 embedded within Multiclass operating on
        # pairs (e.g. SMLR)
        req_nsamples += [req_nsamples[-1] * 2]

        # Also for regression_based -- they can do multiclass
        # but only 1 sensitivity is provided
        if 'regression_based' in clf.__tags__:
            req_nsamples += [nsplits]

    # number of features should correspond
    self.failUnlessEqual(sens.shape[1], ds.nfeatures)
    # number of samples/sensitivities should also be reasonable
    self.failUnless(sens.shape[0] in req_nsamples)

    # Check if labels are present
    self.failUnless('splits' in sens.sa)
    self.failUnless('targets' in sens.sa)
    # should be 1D -- otherwise dtype object
    self.failUnless(sens.sa.targets.ndim == 1)

    sens_ulabels = sens.sa['targets'].unique
    # Some labels might be pairs (tuples), so the ndarray would be of
    # dtype object and we would need to get them all
    if sens_ulabels.dtype is np.dtype('object'):
        sens_ulabels = np.unique(
            reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

    assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

    errors = [x.percent_correct for x in sana.clf.ca.stats.matrices]

    # let's go through all sensitivities and see if we selected the right
    # features
    #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
    if '5%' in clf.descr \
           or (nlabels > 2 and 'regression_based' in clf.__tags__):
        # Some meta classifiers (5% of ANOVA) are too harsh ;-)
        # if we get fewer than 2 features with non-zero sensitivities we
        # cannot really test
        # Also -- regression-based classifiers' performance for multiclass
        # is expected to suck in general
        return

    if cfg.getboolean('tests', 'labile', default='yes'):
        for conf_matrix in [sana.clf.ca.training_stats] \
                           + sana.clf.ca.stats.matrices:
            self.failUnless(
                conf_matrix.percent_correct >= 70,
                msg="We must have trained on each one more or "
                    "less correctly. Got %f%% correct on %d labels"
                    % (conf_matrix.percent_correct, nlabels))

    # Since we now have sensitivities per split and possibly per label --
    # let's just collapse (sum) them per feature per label across splits
    sensm = FxMapper('samples', lambda x: np.sum(x),
                     uattrs=['targets']).forward(sens)
    sensgm = maxofabs_sample().forward(sensm)  # per-feature max of abs values

    assert_equal(sensgm.shape[0], 1)
    assert_equal(sensgm.shape[1], ds.nfeatures)

    selected = FixedNElementTailSelector(
        len(ds.a.bogus_features))(sensgm.samples[0])

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnlessEqual(
            set(selected), set(ds.a.nonbogus_features),
            msg="At the end we should have selected the right features. "
                "Chose %s whereas nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

        # Now test each one per label
        # TODO: collect all failures and spit them out at once --
        #       that would make it easy to see if the sensitivity
        #       just has an incorrect order of labels assigned
        for sens1 in sensm:
            labels1 = sens1.targets     # labels (1) for this sensitivity
            lndim = labels1.ndim
            label = labels1[0]          # current label

            # XXX whole lndim comparison should be gone after
            #     things get fixed and we arrive here with a tuple!
            if lndim == 1:              # just a single label
                self.failUnless(label in ulabels)

                ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                # should have just 1 feature for the label
                self.failUnlessEqual(len(ilabel_all), 1)
                ilabel = ilabel_all[0]

                maxsensi = np.argmax(sens1)  # index of max sensitivity
                self.failUnlessEqual(
                    maxsensi, ilabel,
                    "Maximal sensitivity for %s was found at %i whereas the"
                    " original feature was %i for nonbogus features %s"
                    % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
            elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                # we should have the highest (in abs) coefficients in
                # those two labels
                maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                ilabel2 = [np.where(ds.fa.nonbogus_targets == l)[0][0]
                           for l in label]
                self.failUnlessEqual(
                    set(maxsensi2), set(ilabel2),
                    "Maximal sensitivity for %s was found in %s whereas the"
                    " original features were %s for nonbogus features %s"
                    % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))

                # Now test for the sign of each one in the pair ;) In
                # all binary problems L1 (-1) -> L2 (+1), so the
                # weights for L2 should be positive.  To test for
                # L1 -- invert the sign.
                # We already know (if we haven't failed the previous test)
                # that those 2 were the strongest -- so check only signs
                self.failUnless(
                    sens1.samples[0, ilabel2[0]] < 0,
                    "With %i classes in pair %s got feature %i for %r >= 0"
                    % (nlabels, label, ilabel2[0], label[0]))
                self.failUnless(
                    sens1.samples[0, ilabel2[1]] > 0,
                    "With %i classes in pair %s got feature %i for %r <= 0"
                    % (nlabels, label, ilabel2[1], label[1]))
            else:
                # yoh could be wrong in this assumption... time will show
                self.fail("Got unknown number of labels per sensitivity: %s."
                          " Should be either a single label or a pair"
                          % labels1)