Example #1
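PyMVPA's ZScoreMapper applying previously trained parameters: _forward_dataset z-scores the samples either with one global mean/std pair or per chunk, warning when a chunk is too small for the result to be meaningful.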
    def _forward_dataset(self, ds):
        # local binding
        chunks_attr = self.__chunks_attr
        dtype = self.__dtype

        if (
            __debug__
            and chunks_attr is not None
            and np.array(get_nsamples_per_attr(ds, chunks_attr).values()).min() <= 2
        ):
            warning(
                "Z-scoring chunk-wise when a chunk has fewer than three "
                "samples will set features in these samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk)."
            )

        params = self.__params_dict
        if params is None:
            raise RuntimeError("ZScoreMapper needs to be trained "
                               "before calling forward()")

        if self._secret_inplace_zscore:
            mds = ds
        else:
            # shallow copy to put the new stuff in
            mds = ds.copy(deep=False)

        # cast the data to float, since in-place operations below do not upcast!
        if np.issubdtype(mds.samples.dtype, np.integer):
            mds.samples = mds.samples.astype(dtype)

        if "__all__" in params:
            # we have a global parameter set
            mds.samples = self._zscore(mds.samples, *params["__all__"])
        else:
            # per chunk z-scoring
            for c in mds.sa[chunks_attr].unique:
                if c not in params:
                    raise RuntimeError(
                        "%s has no parameters for chunk '%s'. It probably "
                        "wasn't present in the training dataset!?" % (self.__class__.__name__, c)
                    )
                slicer = np.where(mds.sa[chunks_attr].value == c)[0]
                mds.samples[slicer] = self._zscore(mds.samples[slicer], *params[c])

        return mds
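For reference, the transform replayed above is ordinary z-scoring, (x - mean) / std, computed within each chunk from parameters stored at training time. A minimal self-contained sketch of the chunk-wise case in plain NumPy (the toy samples and chunks arrays are made up for illustration):

import numpy as np

# toy data: 6 samples x 2 features, two chunks of 3 samples each (made up)
samples = np.arange(12, dtype=float).reshape(6, 2)
chunks = np.array([0, 0, 0, 1, 1, 1])

zscored = samples.copy()
for c in np.unique(chunks):
    idx = np.where(chunks == c)[0]
    mean = samples[idx].mean(axis=0)  # per-feature mean within the chunk
    std = samples[idx].std(axis=0)    # per-feature std within the chunk
    zscored[idx] = (samples[idx] - mean) / std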
Example #2
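A PyMVPA splitter's __call__, behaving like a generator: for every split configuration it flags the last split, optionally permutes a sample attribute, optionally subsamples each target class, and yields the finalized datasets.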
    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """

        # local bindings to methods to gain some speedup
        ds_class = dataset.__class__

        # for each split
        cfgs = self.splitcfg(dataset)
        n_cfgs = len(cfgs)

        # Finally split the data
        for isplit, split in enumerate(cfgs):

            # determine sample sizes
            if not operator.isSequenceType(self.__npertarget) \
                   or isinstance(self.__npertarget, str):
                npertargetsplit = [self.__npertarget] * len(split)
            else:
                npertargetsplit = self.__npertarget

            # get the split datasets
            split_ds = self.split_dataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, npertarget in zip(split_ds, npertargetsplit):
                    # Set a flag on the dataset indicating whether this
                    # was the last split
                    # ??? per our discussion this might be the best
                    #     solution which would scale if we care about
                    #     thread-safety etc
                    if ds is not None:
                        ds_a = ds.a
                        lastsplit = (isplit == n_cfgs - 1)
                        if not ds_a.has_key('lastsplit'):
                            # if not yet known -- add one
                            ds_a['lastsplit'] = lastsplit
                        else:
                            # otherwise just assign a new value
                            ds_a.lastsplit = lastsplit
                    # permute the labels
                    if self.__permute_attr is not None:
                        permute_attr(ds,
                                     attr=self.__permute_attr,
                                     chunks_attr='chunks',
                                     col='sa')

                    # select subset of samples if requested
                    if npertarget == 'all' or ds is None:
                        finalized_datasets.append(ds)
                    else:
                        # We need to select a subset of samples
                        # TODO: move all this logic within random_sample

                        # go for maximum possible number of samples provided
                        # by each label in this dataset
                        if npertarget == 'equal':
                            # determine the min number of samples per class
                            npl = np.array(
                                get_nsamples_per_attr(
                                    ds, 'targets').values()).min()
                        elif isinstance(npertarget, float) or (
                                operator.isSequenceType(npertarget)
                                and len(npertarget) > 0
                                and isinstance(npertarget[0], float)):
                            # determine number of samples per class and take
                            # a ratio
                            counts = np.array(
                                get_nsamples_per_attr(ds, 'targets').values())
                            npl = (counts * npertarget).round().astype(int)
                        else:
                            npl = npertarget

                        # finally select the patterns
                        finalized_datasets.append(random_samples(ds, npl))

                if self._reverse:
                    yield finalized_datasets[::-1]
                else:
                    yield finalized_datasets
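The npertarget handling above accepts several spellings: 'all' keeps every sample, 'equal' downsamples each class to the size of the smallest one, a float takes that ratio of each class's count, and any other sequence is used as explicit per-class counts. A small standalone sketch of that resolution step, with a hypothetical counts array (resolve_npertarget is an illustrative name, not part of the library):

import numpy as np

counts = np.array([20, 50, 30])  # hypothetical samples per target class

def resolve_npertarget(npertarget, counts):
    # mirrors the dispatch in the example above
    if npertarget == 'equal':
        # downsample every class to the smallest class size
        return np.repeat(counts.min(), len(counts))
    if isinstance(npertarget, float):
        # take a ratio of each class's count
        return (counts * npertarget).round().astype(int)
    return np.asarray(npertarget)  # explicit per-class counts

print(resolve_npertarget('equal', counts))  # -> [20 20 20]
print(resolve_npertarget(0.5, counts))      # -> [10 25 15]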
Example #3
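A unit test from PyMVPA: on a heavily disbalanced dataset a classifier with a single C parameter loses the minority class, while per-class C values derived from the class-size ratio restore its true positive rate (this variant uses the newer CrossValidation / NFoldPartitioner API).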
    def test_cper_class(self, clf):
        if not clf.params.has_key('C'):
            # skip those without C
            return

        ds = datasets['uni2medium'].copy()
        ds__ = datasets['uni2medium'].copy()
        #
        # balanced set
        # Let's add a bit of noise to drive the classifier nuts; the same
        # is done for the disbalanced set below
        ds__.samples = ds__.samples + \
                       0.5 * np.random.normal(size=ds__.samples.shape)
        #
        # disbalanced set
        # let's overpopulate label 0
        times = 20
        ds_ = ds[(range(ds.nsamples) + range(ds.nsamples/2) * times)]
        ds_.samples = ds_.samples + \
                      0.5 * np.random.normal(size=ds_.samples.shape)
        spl = get_nsamples_per_attr(ds_, 'targets')  # samples per label

        cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
        # on balanced
        e = cve(ds__)
        tpr_1 = cve.ca.stats.stats["TPR"][1]

        # on disbalanced
        e = cve(ds_)
        tpr_2 = cve.ca.stats.stats["TPR"][1]

        # Set '1 C per label'
        # recreate cvte since previous might have operated on copies
        cve = CrossValidation(clf, NFoldPartitioner(),
                              enable_ca='stats')
        oldC = clf.params.C
        # TODO: provide clf.params.C not with a tuple but dictionary
        #       with C per label (now order is deduced in a cruel way)
        ratio = np.sqrt(float(spl[ds_.UT[0]])/spl[ds_.UT[1]])
        clf.params.C = (-1/ratio, -1*ratio)
        try:
            # on disbalanced but with balanced C
            e_ = cve(ds_)
        finally:
            # restore the original C whether or not the call succeeded
            clf.params.C = oldC
        tpr_3 = cve.ca.stats.stats["TPR"][1]

        # Actual tests
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(tpr_1 > 0.25,
                            msg="Without disbalance we should have some "
                            "hits, but got TPR=%.3f" % tpr_1)

            self.failUnless(tpr_2 < 0.25,
                            msg="With disbalance we should have almost no "
                            "hits for minor, but got TPR=%.3f" % tpr_2)

            self.failUnless(tpr_3 > 0.25,
                            msg="With disbalanced data but ratio-based Cs "
                            "we should have some hits for minor, but got "
                            "TPR=%.3f" % tpr_3)
Example #4
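The same test written against the older generation of the PyMVPA API: TransferError plus NFoldSplitter wrapped in CrossValidatedTransferError, with the confusion matrix exposed through the 'confusion' conditional attribute rather than 'stats'.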
    def test_cper_class(self, clf):
        if not clf.params.has_key('C'):
            # skip those without C
            return

        ds = datasets['uni2medium'].copy()
        ds__ = datasets['uni2medium'].copy()
        #
        # balanced set
        # Let's add a bit of noise to drive the classifier nuts; the same
        # is done for the disbalanced set below
        ds__.samples = ds__.samples + \
                       0.5 * np.random.normal(size=ds__.samples.shape)
        #
        # disbalanced set
        # let's overpopulate label 0
        times = 20
        ds_ = ds[(range(ds.nsamples) + range(ds.nsamples/2) * times)]
        ds_.samples = ds_.samples + \
                      0.5 * np.random.normal(size=ds_.samples.shape)
        spl = get_nsamples_per_attr(ds_, 'targets')  # samples per label

        cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                          enable_ca='confusion')
        # on balanced
        e = cve(ds__)
        tpr_1 = cve.ca.confusion.stats["TPR"][1]

        # on disbalanced
        e = cve(ds_)
        tpr_2 = cve.ca.confusion.stats["TPR"][1]

        # Set '1 C per label'
        # recreate cvte since previous might have operated on copies
        cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                          enable_ca='confusion')
        oldC = clf.params.C
        # TODO: provide clf.params.C not with a tuple but dictionary
        #       with C per label (now order is deduced in a cruel way)
        ratio = np.sqrt(float(spl[ds_.UT[0]])/spl[ds_.UT[1]])
        clf.params.C = (-1/ratio, -1*ratio)
        try:
            # on disbalanced but with balanced C
            e_ = cve(ds_)
        finally:
            # restore the original C whether or not the call succeeded
            clf.params.C = oldC
        tpr_3 = cve.ca.confusion.stats["TPR"][1]

        # Actual tests
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(tpr_1 > 0.25,
                            msg="Without disbalance we should have some "
                            "hits, but got TPR=%.3f" % tpr_1)

            self.failUnless(tpr_2 < 0.25,
                            msg="With disbalance we should have almost no "
                            "hits for minor, but got TPR=%.3f" % tpr_2)

            self.failUnless(tpr_3 > 0.25,
                            msg="With disbalanced data but ratio-based Cs "
                            "we should have some hits for minor, but got "
                            "TPR=%.3f" % tpr_3)