Exemplo n.º 1
0
    def _call(self, dataset):
        """Compute featurewise f-scores, contrasting each target with the rest.

        For every unique value of the targets attribute the scratch labels
        are overwritten in place (1 for the current value, 2 for all other
        samples) and ``OneWayAnova._call`` is run on a lightweight dataset
        holding only the samples and those labels.

        Returns the per-target results combined with ``vstack``; the unique
        target values are stored in the result under the targets attribute.
        """
        attr_name = self._targets_attr
        targets_sa = dataset.sa[attr_name]
        unique_targets = targets_sa.unique
        true_labels = targets_sa.value
        # scratch copy which gets overwritten on every iteration
        binary_labels = true_labels.copy()

        # Very shallow stand-in for the dataset: just samples plus the
        # scratch labels.
        # NOTE(review): presumably Dataset keeps a reference to
        # `binary_labels`, so the in-place relabeling below is visible to
        # the ANOVA call -- confirm against Dataset's constructor.
        slim_ds = Dataset(dataset.samples, sa={attr_name: binary_labels})

        per_target = []
        for target in unique_targets:
            # the two masks are disjoint, so assignment order is irrelevant
            binary_labels[true_labels != target] = 2
            binary_labels[true_labels == target] = 1
            anova_ds = OneWayAnova._call(self, slim_ds)
            if 'fprob' in anova_ds.fa:
                # give fprob a target-specific name so that it survives
                # the final aggregation stage
                anova_ds.fa['fprob_' + str(target)] = anova_ds.fa.fprob
                del anova_ds.fa['fprob']
            per_target.append(anova_ds)

        stacked = vstack(per_target)
        stacked.sa[attr_name] = unique_targets
        return stacked
Exemplo n.º 2
0
    def _call(self, dataset):
        """Run a one-vs-rest ANOVA per unique target and stack the f-scores.

        The targets are taken from the sample attribute named by
        ``self._targets_attr``.  For each unique target value a binary
        labeling (1 = this target, 2 = any other) is written into a scratch
        label array, and ``OneWayAnova._call`` is evaluated on a reduced
        dataset that contains only the samples and that label array.

        Returns the ``vstack`` of all per-target results, with the unique
        target values assigned to the targets sample attribute.
        """
        key = self._targets_attr
        targets_sa = dataset.sa[key]
        uniques = targets_sa.unique
        original = targets_sa.value
        scratch = original.copy()

        # Reduced dataset: only samples and the (mutable) scratch labels.
        # NOTE(review): this looks like it depends on Dataset not copying
        # `scratch` -- verify.
        reduced = Dataset(dataset.samples, sa={key: scratch})

        collected = []
        for value in uniques:
            is_value = original == value
            is_other = original != value
            scratch[is_value] = 1
            scratch[is_other] = 2

            scores = OneWayAnova._call(self, reduced)
            if 'fprob' in scores.fa:
                # rename fprob to a label-specific attribute so it is not
                # lost in the final aggregation stage
                scores.fa['fprob_' + str(value)] = scores.fa.fprob
                del scores.fa['fprob']
            collected.append(scores)

        combined = vstack(collected)
        combined.sa[key] = uniques
        return combined
Exemplo n.º 3
0
def test_mergeds():
    """Check dataset merging through ``append``, ``vstack`` and ``hstack``."""
    # Fixtures: all-ones samples with 5 features each; they differ in the
    # number of samples, targets, presence of chunks and feature attributes.
    t1_nochunks = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    t1_nochunks.fa['one'] = np.ones(5)
    t1_chunk1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    t1_chunk1.fa['one'] = np.zeros(5)
    t2_chunk1 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    t2_nochunks = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    t3_chunk2 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    t3_chunk2.fa['test'] = np.arange(5)

    # appending must fail if attributes are missing in one of the datasets
    assert_raises(DatasetError, t1_chunk1.append, t1_nochunks)

    expected_targets = [1, 1, 1, 1, 1, 2, 2, 2]
    expected_chunks = [1, 1, 1, 1, 1, 1, 1, 1]

    # the same append scenario is verified twice, each time on a fresh copy
    for _ in range(2):
        grown = t1_chunk1.copy()
        grown.append(t2_chunk1)

        ok_(grown.nfeatures == 5)
        ok_((grown.targets == expected_targets).all())
        ok_((grown.chunks == expected_chunks).all())

    #
    # appending
    #

    # both datasets need to carry the same samples attributes
    assert_raises(DatasetError, t2_chunk1.append, t2_nochunks)

    #
    # vstacking
    #
    if __debug__:
        # this check is only performed in __debug__ mode
        assert_raises(ValueError, vstack,
                      (t1_nochunks, t1_chunk1, t2_chunk1, t2_nochunks))
    stackable = (t1_chunk1, t2_chunk1, t3_chunk2)
    vstacked = vstack(stackable)
    total_samples = np.sum([len(part) for part in stackable])
    assert_equal(vstacked.shape, (total_samples, t1_chunk1.nfeatures))
    assert_true('test' in vstacked.fa)
    assert_array_equal(vstacked.sa.targets, [1]*5 + [2]*3 + [3]*2)

    #
    # hstacking
    #

    # datasets with differing numbers of samples cannot be hstacked
    assert_raises(ValueError, hstack, stackable)
    side_by_side = (t1_nochunks, t1_chunk1)
    hstacked = hstack(side_by_side)
    total_features = np.sum([part.nfeatures for part in side_by_side])
    assert_equal(hstacked.shape, (len(t1_chunk1), total_features))
    assert_true('chunks' in hstacked.sa)
    assert_array_equal(hstacked.fa.one, [1]*5 + [0]*5)
Exemplo n.º 4
0
    def test_ifs(self, svm):
        """Check incremental feature search (IFS) on stacked train/test data.

        Two scenarios are covered: the dataset returned by
        ``self.get_data()``, where the selected feature set must be the one
        with the lowest recorded error, and the ``'dumb2'`` dataset, where
        the first selected feature is compared against the first feature of
        the input.

        Parameters
        ----------
        svm
          Classifier used both for the feature selection criterion and for
          the performance assessment.
        """
        # measure for feature selection criterion and performance assessment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm, NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        # go for lower tail selection as data_measure will return
        # errors -> low is good
        fselector = FixedNElementTailSelector(1, tail='lower', mode='select')
        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  fselector=fselector,
                  )

        # tag two copies of the data as 'train' and 'test' and stack them,
        # so the Splitter above can separate them again via 'purpose'
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertEqual(ds.nfeatures, orig_nfeatures)

        # check that the features set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertEqual(resds.nfeatures, e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Exemplo n.º 5
0
    def _call(self, dataset):
        """Compute featurewise f-scores, contrasting each target with the rest.

        For each unique value in ``dataset.sa['targets']`` a binary labeling
        (1 = this target, 2 = any other) is written into a scratch copy of
        the targets and handed to ``OneWayAnova._call`` together with the
        unmodified dataset.

        Returns the ``vstack`` of all per-target results, with the unique
        target values stored as the ``targets`` sample attribute.
        """
        true_labels = dataset.targets
        # scratch copy which gets overwritten on every iteration
        binary_labels = true_labels.copy()
        unique_targets = dataset.sa['targets'].unique

        per_target = []
        for target in unique_targets:
            # the two masks are disjoint, so assignment order is irrelevant
            binary_labels[true_labels != target] = 2
            binary_labels[true_labels == target] = 1
            anova_ds = OneWayAnova._call(self, dataset, binary_labels)
            if 'fprob' in anova_ds.fa:
                # give fprob a target-specific name so that it survives
                # the final aggregation stage
                anova_ds.fa['fprob_' + str(target)] = anova_ds.fa.fprob
                del anova_ds.fa['fprob']
            per_target.append(anova_ds)

        stacked = vstack(per_target)
        stacked.sa['targets'] = dataset.sa['targets'].unique
        return stacked
Exemplo n.º 6
0
 def _forward_dataset(self, ds):
     """Forward-map a dataset, optionally processing each chunk separately.

     Without a chunks attribute the whole dataset goes straight to
     ``_forward_dataset_helper``.  Otherwise a stripped-down shallow copy
     is split on the chunks attribute, every piece is processed by the
     helper, and the pieces are merged again with ``vstack``.  Feature
     and dataset attributes of the input are put back onto the result.
     """
     chunks_attr = self.__chunks_attr
     if chunks_attr is None:
         return self._forward_dataset_helper(ds)

     # Strip the dataset down to speed up local processing.
     # NOTE(review): presumably sa=[] drops all sample attributes while
     # sa=None keeps them -- confirm against Dataset.copy.
     keep_sa = [] if self.__attr_strategy == 'remove' else None
     stripped = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])

     # process all chunks individually and merge them again
     splitter = Splitter(chunks_attr)
     merged = vstack([self._forward_dataset_helper(part)
                      for part in splitter.generate(stripped)])

     # put back the attributes removed for processing
     merged.fa.update(ds.fa)
     merged.a.update(ds.a)
     return merged
Exemplo n.º 7
0
 def _forward_dataset(self, ds):
     """Forward-map a dataset, optionally processing each chunk separately.

     Without a chunks attribute the whole dataset goes straight to
     ``_forward_dataset_helper``.  Otherwise a stripped-down shallow copy
     is split with a ``CustomSplitter`` holding one rule per unique
     chunk value, the first element of every split is processed by the
     helper, and the results are merged again with ``vstack``.  Feature
     and dataset attributes of the input are put back onto the result.
     """
     chunks_attr = self.__chunks_attr
     if chunks_attr is None:
         return self._forward_dataset_helper(ds)

     # Strip the dataset down to speed up local processing.
     # NOTE(review): presumably sa=[] drops all sample attributes while
     # sa=None keeps them -- confirm against Dataset.copy.
     keep_sa = [] if self.__attr_strategy == 'remove' else None
     stripped = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])

     # one split rule per unique chunk value; a custom splitter is used
     # to speed up splitting
     split_rules = [((chunk, ), ) for chunk in ds.sa[chunks_attr].unique]
     splitter = CustomSplitter(split_rules, attr=chunks_attr)

     # process all chunks individually and merge them again
     merged = vstack([self._forward_dataset_helper(split[0])
                      for split in splitter(stripped)])

     # put back the attributes removed for processing
     merged.fa.update(ds.fa)
     merged.a.update(ds.a)
     return merged
Exemplo n.º 8
0
    def test_vstack_and_origids_issue(self):
        """Regression test: cached kernels on vstacked datasets.

        Cross-validated errors of an SVM with a plain linear kernel are
        compared against those of an SVM whose kernel is wrapped in a
        ``CachedKernel``, on a dataset produced by ``vstack`` after its
        origids have been re-initialized.  Skipped unless a sufficiently
        recent shogun is available.
        """
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k  = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(
            TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(
            TransferError(clf_), NFoldSplitter())

        ds = datasets['uni2large_test'].copy(deep=True)
        # Assure that there are no origids yet.  This used to read
        # ``~('orig_ids' in ds.sa)``, which could never fail: ``~`` applied
        # to a bool yields -1 or -2, both truthy, and the key was
        # misspelled relative to the 'origids' attribute checked below.
        ok_('origids' not in ds.sa)
        ck.compute(ds)                  # so we initialize origids
        ok_('origids' in ds.sa)
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complain now since samples' origids would not be unique
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')      # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)
Exemplo n.º 9
0
    def test_vstack_and_origids_issue(self):
        """Regression test: cached kernels on vstacked datasets.

        Cross-validated errors of an SVM with a plain linear kernel are
        compared against those of an SVM whose kernel is wrapped in a
        ``CachedKernel``, on a dataset produced by ``vstack`` after its
        origids have been re-initialized.  Skipped unless a sufficiently
        recent shogun is available.
        """
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(TransferError(clf_),
                                            NFoldSplitter())

        ds = datasets['uni2large_test'].copy(deep=True)
        # Assure that there are no origids yet.  This used to read
        # ``~('orig_ids' in ds.sa)``, which could never fail: ``~`` applied
        # to a bool yields -1 or -2, both truthy, and the key was
        # misspelled relative to the 'origids' attribute checked below.
        ok_('origids' not in ds.sa)
        ck.compute(ds)  # so we initialize origids
        ok_('origids' in ds.sa)
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complain now since samples' origids would not be unique
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')  # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)