Example #1
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v, doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         self.failUnlessEqual(sv.enabled, sv_dc.enabled)
         self.failUnlessEqual(sv.name, sv_dc.name)
         self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
Example #2
 def test_deep_copying_state_variable(self):
     for v in (True, False):
         sv = ConditionalAttribute(enabled=v,
                            doc="Testing")
         sv.enabled = not v
         sv_dc = copy.deepcopy(sv)
         self.failUnlessEqual(sv.enabled, sv_dc.enabled)
         self.failUnlessEqual(sv.name, sv_dc.name)
         self.failUnlessEqual(sv._instance_index, sv_dc._instance_index)
Example #3
class TestClassParametrized(TestClassProper, ClassWithCollections):
    p1 = Parameter(0)
    state0 = ConditionalAttribute(enabled=False)

    def __init__(self, **kwargs):
        # XXX make such example when we actually need to invoke
        # constructor
        # TestClassProper.__init__(self, **kwargs)
        ClassWithCollections.__init__(self, **kwargs)
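
A brief usage note (not from the original code): the sketch below shows how such a class is typically driven, reusing the names defined above. The enable_ca constructor argument and attribute-style access to .params follow the conventions visible in the later examples (e.g. BLR.__repr__ and SVM._train), so treat the exact spelling as an assumption about the PyMVPA version in use.

# Hypothetical usage sketch of TestClassParametrized
obj = TestClassParametrized(p1=5, enable_ca=['state0'])
print obj.params.p1                  # parameter value given at construction
print obj.ca.is_enabled('state0')    # True: enabled via enable_ca
obj.ca.state0 = 123                  # values are stored through the .ca collection
print obj.ca.state0
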
Example #4
class FeatureSelection(ClassWithCollections):
    """Base class for any feature selection

    Base class for Functors which implement feature selection on the
    datasets.
    """

    selected_ids = ConditionalAttribute(enabled=False)

    def __init__(self, **kwargs):
        # base init first
        ClassWithCollections.__init__(self, **kwargs)

    def __call__(self, dataset, testdataset=None):
        """Invocation of the feature selection

        Parameters
        ----------
        dataset : Dataset
          dataset used to select features
        testdataset : Dataset
          dataset that might be used to compute a stopping criterion

        Returns
        -------
        Dataset or tuple
          The dataset containing the selected features. If a ``testdataset`` has
          been passed, a tuple with both processed datasets is returned instead.
          Note that the resulting dataset(s) reference the same values for samples
          attributes (e.g. labels and chunks) of the input dataset(s): be careful
          if you alter them later.
        """
        # Derived classes must provide an interface to access other
        # information relevant to the feature selection process (e.g. mask,
        # elimination step (in RFE), etc)
        results = self._call(dataset, testdataset)
        if testdataset is None:
            return results[0]
        else:
            return results

    def untrain(self):
        """ 'Untrain' feature selection

        Necessary for full 'untraining' of the classifiers. By default
        does nothing, needs to be overridden in corresponding feature
        selections to pass to the sensitivities
        """
        pass
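
To make the contract of __call__/_call above concrete, here is a minimal, hypothetical subclass sketch (not part of the original code): _call returns a tuple of processed datasets and may fill the selected_ids conditional attribute; Dataset slicing with [:, ids] is assumed to behave as in PyMVPA.

import numpy as np

class KeepFirstNFeatures(FeatureSelection):
    """Toy selector keeping the first n features -- illustration only."""

    def __init__(self, nkeep=10, **kwargs):
        FeatureSelection.__init__(self, **kwargs)
        self.__nkeep = nkeep

    def _call(self, dataset, testdataset=None):
        # ids of the features to keep
        ids = np.arange(min(self.__nkeep, dataset.nfeatures))
        # assignment is harmless if the conditional attribute is disabled
        self.ca.selected_ids = ids
        wdataset = dataset[:, ids]
        wtestdataset = testdataset[:, ids] if testdataset is not None else None
        # __call__ above returns only the first element when no testdataset
        # was passed
        return (wdataset, wtestdataset)
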
Example #5
class GPRLinearWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GPR trained
    on a given `Dataset`.

    In case of LinearKernel compute explicitly the coefficients
    of the linear regression, together with their variances (if
    requested).

    Note that the intercept is not computed.
    """

    variances = ConditionalAttribute(
        enabled=False,
        doc="Variances of the weights (for GeneralizedLinearKernel)")

    _LEGAL_CLFS = [GPR]

    def _call(self, dataset):
        """Extract weights from GPR
        """

        clf = self.clf
        kernel = clf.kernel
        train_fv = clf._train_fv
        if isinstance(kernel, LinearKernel):
            Sigma_p = 1.0
        else:
            Sigma_p = kernel.params.Sigma_p

        weights = Ndot(Sigma_p, Ndot(train_fv.T, clf._alpha))

        if self.ca.is_enabled('variances'):
            # super ugly formulas that can be quite surely improved:
            tmp = np.linalg.inv(clf._L)
            Kyinv = Ndot(tmp.T, tmp)
            # XXX in such lengthy matrix manipulations you might be better off
            #     using np.matrix where * is a matrix product
            self.ca.variances = Ndiag(
                Sigma_p -
                Ndot(Sigma_p,
                     Ndot(train_fv.T, Ndot(Kyinv, Ndot(train_fv, Sigma_p)))))
        return Dataset(np.atleast_2d(weights))
Example #6
class SMLRWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights SMLR trained
    on a given `Dataset`.

    By default SMLR provides multiple weights per feature (one per label in
    training dataset). By default, all weights are combined into a single
    sensitivity value. Please see the `FeaturewiseDatasetMeasure` constructor
    arguments on how to customize this behavior.
    """

    biases = ConditionalAttribute(enabled=True, doc="A 1-d ndarray of biases")

    _LEGAL_CLFS = [SMLR]

    def _call(self, dataset=None):
        """Extract weights from SMLR classifier.

        SMLR always has weights available, so nothing has to be computed here.
        """
        clf = self.clf
        # transpose to have the number of features on the second axis
        # (as usual)
        weights = clf.weights.T

        if clf.params.has_bias:
            self.ca.biases = clf.biases

        if __debug__:
            debug('SMLR',
                  "Extracting weights for %d-class SMLR" %
                  (len(weights) + 1) +
                  "Result: min=%f max=%f" %\
                  (np.min(weights), np.max(weights)))

        # limit the labels to the number of sensitivity sets, to deal
        # with the case of `fit_all_weights=False`
        return Dataset(
            weights, sa={clf.params.targets_attr: clf._ulabels[:len(weights)]})
Example #7
class ElementSelector(ClassWithCollections):
    """Base class to implement functors to select some elements based on a
    sequence of values.
    """

    ndiscarded = ConditionalAttribute(enabled=True,
        doc="Store number of discarded elements.")

    def __init__(self, mode='discard', **kwargs):
        """
        Parameters
        ----------
        mode : {'discard', 'select'}
            Decides whether to `select` or to `discard` features.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._set_mode(mode)
        """Flag whether to select or to discard elements."""


    ##REF: Name was automagically refactored
    def _set_mode(self, mode):
        """Choose `select` or `discard` mode."""

        if not mode in ['discard', 'select']:
            raise ValueError, "Unkown selection mode [%s]. Can only be one " \
                              "of 'select' or 'discard'." % mode

        self.__mode = mode


    def __call__(self, seq):
        """
        Parameters
        ----------
        seq
           Sequence based on values of which to perform the selection.
           If `Dataset`, then only 1st sample is taken.
        """
        if isinstance(seq, AttrDataset):
            if len(seq)>1:
                raise ValueError(
                    "Feature selectors cannot handle multiple "
                    "sequences in a Dataset at once.  We got dataset %s "
                    "as input."
                    % (seq,))
            seq = seq.samples[0]
        elif hasattr(seq, 'shape'):
            shape = seq.shape
            if len(shape) > 1:
                raise ValueError(
                    "Feature selectors cannot handle multidimensional "
                    "inputs (such as ndarrays with more than a single "
                    "dimension.  We got %s with shape %s "
                    "as input." % (seq.__class__, shape))
        return self._call(seq)

    def _call(self, seq):
        """Implementations in derived classed have to return a list of selected
        element IDs based on the given sequence.
        """
        raise NotImplementedError

    mode = property(fget=lambda self:self.__mode, fset=_set_mode)
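
As a concrete illustration of the _call contract documented above (return the ids of the selected elements) and of the ndiscarded conditional attribute, a hypothetical subclass sketch that is not taken from the original code:

import numpy as np

class ThresholdSelector(ElementSelector):
    """Toy selector: elements whose value exceeds a threshold -- illustration only."""

    def __init__(self, threshold=0.0, **kwargs):
        ElementSelector.__init__(self, **kwargs)
        self.__threshold = threshold

    def _call(self, seq):
        seq = np.asanyarray(seq)
        above = np.nonzero(seq > self.__threshold)[0]
        if self.mode == 'select':
            ids = above
        else:
            # 'discard' mode: drop the elements above threshold, keep the rest
            ids = np.setdiff1d(np.arange(len(seq)), above)
        self.ca.ndiscarded = len(seq) - len(ids)   # enabled=True by default above
        return ids
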
Example #8
class BLR(Classifier):
    """Bayesian Linear Regression (BLR).

    """

    predicted_variances = ConditionalAttribute(
        enabled=False, doc="Variance per each predicted value")

    log_marginal_likelihood = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood")

    __tags__ = ['blr', 'regression', 'linear']

    def __init__(self, sigma_p=None, sigma_noise=1.0, **kwargs):
        """Initialize a BLR regression analysis.

        Parameters
        ----------
        sigma_p : None, float or np.ndarray
          covariance of the Gaussian prior N(0, sigma_p) on the weights.
          If None, an identity matrix of appropriate size is used.
          (Defaults to None)
        sigma_noise : float
          the standard deviation of the Gaussian noise.
          (Defaults to 1.0)

        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint happiness
        self.w = None

        # It does not make sense to calculate a confusion matrix for a
        # BLR:
        self.ca.enable('training_confusion', False)

        # set the prior on w: N(0,sigma_p) , specifying the covariance
        # sigma_p on w:
        self.sigma_p = sigma_p

        # set noise level:
        self.sigma_noise = sigma_noise

        self.ca.predicted_variances = None
        self.ca.log_marginal_likelihood = None
        # Yarik: what was those about??? just for future in
        #        compute_log_marginal_likelihood ?
        # self.targets = None
        pass

    def __repr__(self):
        """String summary of the object
        """
        return """BLR(w=%s, sigma_p=%s, sigma_noise=%f, enable_ca=%s)""" % \
               (self.w, self.sigma_p, self.sigma_noise, str(self.ca.enabled))

    def compute_log_marginal_likelihood(self):
        """
        Compute log marginal likelihood using self.train_fv and self.targets.
        """
        # log_marginal_likelihood = None
        # return log_marginal_likelihood
        raise NotImplementedError

    def _train(self, data):
        """Train regression using `data` (`Dataset`).
        """
        # BLR relies on numerical labels
        train_labels = self._attrmap.to_numeric(
            data.sa[self.params.targets_attr].value)
        # provide a basic (i.e. identity matrix) and correct prior
        # sigma_p, if not provided before or not compliant to 'data':
        if self.sigma_p == None:  # case: not provided
            self.sigma_p = np.eye(data.samples.shape[1] + 1)
        elif self.sigma_p.shape[1] != (data.samples.shape[1] +
                                       1):  # case: wrong dimensions
            self.sigma_p = np.eye(data.samples.shape[1] + 1)
        else:
            # ...then everything is OK :)
            pass

        # add one fake column of '1.0' to model the intercept:
        self.samples_train = np.hstack(
            [data.samples, np.ones((data.samples.shape[0], 1))])
        if type(self.sigma_p) != type(
                self.samples_train):  # if sigma_p is a number...
            self.sigma_p = np.eye(self.samples_train.shape[1]
                                  ) * self.sigma_p  # convert in matrix
            pass

        self.A_inv = np.linalg.inv(
            1.0 / (self.sigma_noise**2) *
            np.dot(self.samples_train.T, self.samples_train) +
            np.linalg.inv(self.sigma_p))
        self.w = 1.0 / (self.sigma_noise**2) * np.dot(
            self.A_inv, np.dot(self.samples_train.T, train_labels))
        pass

    @accepts_dataset_as_samples
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """

        data = np.hstack([data, np.ones((data.shape[0], 1), dtype=data.dtype)])
        predictions = np.dot(data, self.w)

        if self.ca.is_enabled('predicted_variances'):
            # do computation only if conditional attribute was enabled
            self.ca.predicted_variances = np.dot(
                data, np.dot(self.A_inv, data.T)).diagonal()[:, np.newaxis]
        self.ca.estimates = predictions
        return predictions

    def set_hyperparameters(self, *args):
        """
        Set hyperparameters' values.

        Note that this is a list so the order of the values is
        important.
        """
        args = args[0]
        self.sigma_noise = args[0]
        if len(args) > 1:
            self.sigma_p = np.array(args[1:])  # XXX check if this is ok
            pass
        return

    pass
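
The linear algebra inside _train/_predict above is ordinary Bayesian linear regression. The standalone numpy sketch below restates it outside of any PyMVPA machinery (illustration only; variable names mirror the class attributes):

import numpy as np

np.random.seed(0)
X = np.random.randn(20, 3)                          # training samples
y = X.dot([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(20)
sigma_noise = 1.0

Xi = np.hstack([X, np.ones((X.shape[0], 1))])       # extra column models the intercept
sigma_p = np.eye(Xi.shape[1])                       # identity prior, as in _train

A_inv = np.linalg.inv(Xi.T.dot(Xi) / sigma_noise ** 2 + np.linalg.inv(sigma_p))
w = A_inv.dot(Xi.T.dot(y)) / sigma_noise ** 2       # posterior mean of the weights

Xt = np.hstack([X[:5], np.ones((5, 1))])            # "test" data, same intercept trick
predictions = Xt.dot(w)
predicted_variances = Xt.dot(A_inv).dot(Xt.T).diagonal()[:, np.newaxis]
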
Example #9
class CrossValidatedTransferError(DatasetMeasure, Harvestable):
    """Classifier cross-validation.

    This class provides a simple interface to cross-validate a classifier
    on datasets generated by a splitter from a single source dataset.

    Arbitrary performance/error values can be computed by specifying an error
    function (used to compute an error value for each cross-validation fold)
    and a combiner function that aggregates all computed error values across
    cross-validation folds.
    """

    results = ConditionalAttribute(
        enabled=False, doc="""Store individual results in the state""")
    splits = ConditionalAttribute(
        enabled=False,
        doc="""Store the actual splits of the data. Can be memory expensive""")
    transerrors = ConditionalAttribute(
        enabled=False,
        doc="""Store copies of transerrors at each step. If enabled -
       operates on clones of transerror, but for the last split original
       transerror is used""")
    confusion = ConditionalAttribute(
        enabled=False, doc="""Store total confusion matrix (if available)""")
    training_confusion = ConditionalAttribute(
        enabled=False,
        doc="""Store total training confusion matrix (if available)""")
    samples_error = ConditionalAttribute(enabled=False,
                                         doc="Per sample errors.")

    def __init__(self,
                 transerror,
                 splitter=None,
                 expose_testdataset=False,
                 harvest_attribs=None,
                 copy_attribs='copy',
                 samples_idattr='origids',
                 **kwargs):
        """
        Parameters
        ----------
        transerror : TransferError instance
          Provides the classifier used for cross-validation.
        splitter : Splitter or None
          Used to split the dataset for cross-validation folds. By
          convention the first dataset in the tuple returned by the
          splitter is used to train the provided classifier. If the
          first element is 'None' no training is performed. The second
          dataset is used to generate predictions with the (trained)
          classifier. If `None` (default) an instance of
          :class:`~mvpa.datasets.splitters.NoneSplitter` is used.
        expose_testdataset : bool, optional
          In a proper pipeline the classifier must not know anything
          about the testing data, but in some cases exposing it leads only
          to marginal harm, thus it might be desired to enable this (e.g. to
          provide the testdataset for RFE to determine the stopping point).
        harvest_attribs : list of str
          Which attributes of the call to store and return within the
          harvested conditional attribute
        copy_attribs : None or str, optional
          Force copying values of attributes on harvesting
        samples_idattr : str, optional
          Which samples attribute to use to identify samples and store the
          samples_error conditional attribute
        **kwargs
          All additional arguments are passed to the
          :class:`~mvpa.measures.base.DatasetMeasure` base class.
        """
        DatasetMeasure.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        if splitter is None:
            self.__splitter = NoneSplitter()
        else:
            self.__splitter = splitter

        self.__transerror = transerror
        self.__expose_testdataset = expose_testdataset
        self.__samples_idattr = samples_idattr

# TODO: put back in ASAP
#    def __repr__(self):
#        """String summary over the object
#        """
#        return """CrossValidatedTransferError /
# splitter: %s
# classifier: %s
# errorfx: %s
# combiner: %s""" % (indent_doc(self.__splitter), indent_doc(self.__clf),
#                      indent_doc(self.__errorfx), indent_doc(self.__combiner))

    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate splits for the single cross-validation folds.
        """
        # store the results of the splitprocessor
        results = []
        self.ca.splits = []

        # local bindings
        ca = self.ca
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what ca to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if ca.is_enabled(state_var):
                terr_enable += [state_var]

        # charge ca with initial values
        summaryClass = clf.__summary_class__
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.ca.confusion = summaryClass()
        self.ca.training_confusion = summaryClass()
        self.ca.transerrors = []
        if ca.is_enabled('samples_error'):
            dataset.init_origids('samples',
                                 attr=self.__samples_idattr,
                                 mode='existing')
            self.ca.samples_error = dict([
                (id_, []) for id_ in dataset.sa[self.__samples_idattr].value
            ])

        # enable requested ca in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.ca.change_temporarily(enable_ca=terr_enable)

        # We better ensure that underlying classifier is not trained if we
        # are going to deepcopy transerror
        if ca.is_enabled("transerrors"):
            self.__transerror.untrain()

        # collect some info about the splits that were made for the resulting
        # dataset
        splitinfo = []

        # splitter
        for split in self.__splitter(dataset):
            splitinfo.append("%s->%s" % (','.join([
                str(c) for c in split[0].sa[self.__splitter.splitattr].unique
            ]), ','.join([
                str(c) for c in split[1].sa[self.__splitter.splitattr].unique
            ])))

            # only train classifier if splitter provides something in first
            # element of tuple -- this is the behavior of TransferError
            if ca.is_enabled("splits"):
                self.ca.splits.append(split)

            if ca.is_enabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                lastsplit = None
                for ds in split:
                    if ds is not None:
                        lastsplit = ds.a.lastsplit
                        break
                if lastsplit:
                    # only if we could deduce that it was last split
                    # use the 'mother' transerror
                    transerror = self.__transerror
                else:
                    # otherwise -- deep copy
                    transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = split[1]

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if ca.is_enabled("transerrors"):
                self.ca.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if ca.is_enabled("samples_error"):
                for k, v in \
                  transerror.ca.samples_error.iteritems():
                    self.ca.samples_error[k].append(v)

            # pull in child ca
            for state_var in ['confusion', 'training_confusion']:
                if ca.is_enabled(state_var):
                    ca[state_var].value.__iadd__(
                        transerror.ca[state_var].value)

            if __debug__:
                debug("CROSSC", "Split #%d: result %s" \
                      % (len(results), `result`))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put ca of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.ca.reset_changed_temporarily()

        self.ca.results = results
        """Store conditional attribute if it is enabled"""
        results = Dataset(results, sa={'cv_fold': splitinfo})
        return results

    splitter = property(fget=lambda self: self.__splitter,
                        doc="Access to the Splitter instance.")
    transerror = property(fget=lambda self: self.__transerror,
                          doc="Access to the TransferError instance.")
Example #10
class SVM(_SVM):
    """Support Vector Machine Classifier.

    This is a simple interface to the libSVM package.
    """

    # Since this is an internal feature of LibSVM, this conditional attribute is present
    # here
    probabilities = ConditionalAttribute(
        enabled=False,
        doc="Estimates of samples probabilities as provided by LibSVM")

    # TODO p is specific for SVR
    _KNOWN_PARAMS = [
        'epsilon', 'probability', 'shrinking', 'weight_label', 'weight'
    ]

    #_KNOWN_KERNEL_PARAMS = [ 'cache_size' ]

    _KNOWN_SENSITIVITIES = {
        'linear': LinearSVMWeights,
    }
    _KNOWN_IMPLEMENTATIONS = {
        'C_SVC': (_svm.svmc.C_SVC, ('C', ), ('binary', 'multiclass'),
                  'C-SVM classification'),
        'NU_SVC': (_svm.svmc.NU_SVC, ('nu', ), ('binary', 'multiclass'),
                   'nu-SVM classification'),
        'ONE_CLASS':
        (_svm.svmc.ONE_CLASS, (), ('oneclass', ), 'one-class-SVM'),
        'EPSILON_SVR': (_svm.svmc.EPSILON_SVR, ('C', 'tube_epsilon'),
                        ('regression', ), 'epsilon-SVM regression'),
        'NU_SVR': (_svm.svmc.NU_SVR, ('nu', 'tube_epsilon'), ('regression', ),
                   'nu-SVM regression')
    }

    __default_kernel_class__ = LinearLSKernel
    __tags__ = _SVM.__tags__ + ['libsvm']

    def __init__(self, **kwargs):
        # XXX Determine which parameters depend on each other and implement
        # safety/simplifying logic around them
        # already done for: nr_weight
        # thought: weight and weight_label should be a dict
        """Interface class to LIBSVM classifiers and regressions.

        Default implementation (C/nu/epsilon SVM) is chosen depending
        on the given parameters (C/nu/tube_epsilon).
        """

        svm_impl = kwargs.get('svm_impl', None)
        # Depending on given arguments, figure out desired SVM
        # implementation
        if svm_impl is None:
            for arg, impl in [('tube_epsilon', 'EPSILON_SVR'), ('C', 'C_SVC'),
                              ('nu', 'NU_SVC')]:
                if kwargs.has_key(arg):
                    svm_impl = impl
                    if __debug__:
                        debug(
                            'SVM', 'No implementation was specified. Since '
                            '%s is given among arguments, assume %s' %
                            (arg, impl))
                    break
            if svm_impl is None:
                svm_impl = 'C_SVC'
                if __debug__:
                    debug('SVM', 'Assign C_SVC "by default"')
        kwargs['svm_impl'] = svm_impl

        # init base class
        _SVM.__init__(self, **kwargs)

        self._svm_type = self._KNOWN_IMPLEMENTATIONS[svm_impl][0]

        if 'nu' in self._KNOWN_PARAMS and 'epsilon' in self._KNOWN_PARAMS:
            # overwrite eps param with new default value (information
            # taken from libSVM docs)
            self.params['epsilon']._set_default(0.001)

        self.__model = None
        """Holds the trained SVM."""

    def _train(self, dataset):
        """Train SVM
        """
        targets_sa_name = self.params.targets_attr  # name of targets sa
        targets_sa = dataset.sa[targets_sa_name]  # actual targets sa

        # libsvm needs doubles
        src = _data2ls(dataset)

        # libsvm cannot handle literal labels
        labels = self._attrmap.to_numeric(targets_sa.value).tolist()

        svmprob = _svm.SVMProblem(labels, src)

        # Translate few params
        TRANSLATEDICT = {'epsilon': 'eps', 'tube_epsilon': 'p'}
        args = []
        for paramname, param in self.params.items() \
                + self.kernel_params.items():
            if paramname in TRANSLATEDICT:
                argname = TRANSLATEDICT[paramname]
            elif paramname in _svm.SVMParameter.default_parameters:
                argname = paramname
            else:
                if __debug__:
                    debug(
                        "SVM_", "Skipping parameter %s since it is not known"
                        "to libsvm" % paramname)
                continue
            args.append((argname, param.value))

        # ??? All those parameters should be fetched if present from
        # **kwargs and create appropriate parameters within .params or
        # .kernel_params
        libsvm_param = _svm.SVMParameter(
            kernel_type=self.params.kernel.as_raw_ls(),  # Just an integer ID
            svm_type=self._svm_type,
            **dict(args))
        """Store SVM parameters in libSVM compatible format."""

        if self.params.has_key('C'):  #svm_type in [_svm.svmc.C_SVC]:
            Cs = self._get_cvec(dataset)
            if len(Cs) > 1:
                C0 = abs(Cs[0])
                scale = 1.0 / (C0)  #*np.sqrt(C0))
                # so we got 1 C per label
                uls = self._attrmap.to_numeric(targets_sa.unique)
                if len(Cs) != len(uls):
                    raise ValueError, "SVM was parameterized with %d Cs but " \
                          "there are %d labels in the dataset" % \
                          (len(Cs), len(targets_sa.unique))
                weight = [c * scale for c in Cs]
                # All 3 need to be set to take an effect
                libsvm_param._set_parameter('weight', weight)
                libsvm_param._set_parameter('nr_weight', len(weight))
                libsvm_param._set_parameter('weight_label', uls)
            libsvm_param._set_parameter('C', Cs[0])

        self.__model = _svm.SVMModel(svmprob, libsvm_param)

    @accepts_samples_as_dataset
    def _predict(self, data):
        """Predict values for the data
        """
        # libsvm needs doubles
        src = _data2ls(data)
        ca = self.ca

        predictions = [self.model.predict(p) for p in src]

        if ca.is_enabled('estimates'):
            if self.__is_regression__:
                estimates = [self.model.predict_values_raw(p)[0] for p in src]
            else:
                # if 'trained_targets' are literal they have to be mapped
                if np.issubdtype(self.ca.trained_targets.dtype, 'c'):
                    trained_targets = self._attrmap.to_numeric(
                        self.ca.trained_targets)
                else:
                    trained_targets = self.ca.trained_targets
                nlabels = len(trained_targets)
                # XXX We do duplicate work. model.predict calls
                # predict_values_raw internally and then does voting or
                # thresholding. So if speed becomes a factor we might
                # want to move out logic from libsvm over here to base
                # predictions on obtained values, or adjust libsvm to
                # spit out values from predict() as well
                if nlabels == 2:
                    # Apparently libsvm reorders labels so we need to
                    # track (1,0) values instead of (0,1) thus just
                    # lets take negative reverse
                    estimates = [
                        self.model.predict_values(p)[(trained_targets[1],
                                                      trained_targets[0])]
                        for p in src
                    ]
                    if len(estimates) > 0:
                        if __debug__:
                            debug(
                                "SVM",
                                "Forcing estimates to be ndarray and reshaping"
                                " them into 1D vector")
                        estimates = np.asarray(estimates).reshape(
                            len(estimates))
                else:
                    # In multiclass we return dictionary for all pairs
                    # of labels, since libsvm does 1-vs-1 pairs
                    estimates = [self.model.predict_values(p) for p in src]
            ca.estimates = estimates

        if ca.is_enabled("probabilities"):
            # XXX Is this really necessary? yoh doesn't think so since
            # assignment to ca is doing the same
            #self.probabilities = [ self.model.predict_probability(p)
            #                       for p in src ]
            try:
                ca.probabilities = [
                    self.model.predict_probability(p) for p in src
                ]
            except TypeError:
                warning("Current SVM %s doesn't support probability " % self +
                        " estimation.")
        return predictions

    def summary(self):
        """Provide quick summary over the SVM classifier"""
        s = super(SVM, self).summary()
        if self.trained:
            s += '\n # of SVs: %d' % self.__model.get_total_n_sv()
            try:
                prm = _svm.svmc.svm_model_param_get(self.__model.model)
                C = _svm.svmc.svm_parameter_C_get(prm)
                # extract information of how many SVs sit inside the margin,
                # i.e. so called 'bounded SVs'
                inside_margin = np.sum(
                    # take 0.99 to avoid rounding issues
                    np.abs(self.__model.get_sv_coef()) >= 0.99 *
                    _svm.svmc.svm_parameter_C_get(prm))
                s += ' #bounded SVs:%d' % inside_margin
                s += ' used C:%5g' % C
            except:
                pass
        return s

    def untrain(self):
        """Untrain libsvm's SVM: forget the model
        """
        if __debug__ and "SVM" in debug.active:
            debug("SVM", "Untraining %s and destroying libsvm model" % self)
        super(SVM, self).untrain()
        del self.__model
        self.__model = None

    model = property(fget=lambda self: self.__model)
    """Access to the SVM model."""
Example #11
class FeatureSelectionPipeline(FeatureSelection):
    """Feature elimination through the list of FeatureSelection's.

    Given as list of FeatureSelections it applies them in turn.
    """

    nfeatures = ConditionalAttribute(
        doc="Number of features before each step in pipeline")

    # TODO: may be we should also append resultant number of features?

    def __init__(self, feature_selections, **kwargs):
        """Initialize feature selection pipeline

        Parameters
        ----------
        feature_selections : list of FeatureSelection
          selections to use. Order matters.
        """
        # base init first
        FeatureSelection.__init__(self, **kwargs)

        self.__feature_selections = feature_selections
        """Selectors to use in turn"""

    def untrain(self):
        if __debug__:
            debug("FS_", "Untraining FS pipeline: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def _call(self, dataset, testdataset=None, **kwargs):
        """Invocation of the feature selection
        """
        wdataset = dataset
        wtestdataset = testdataset

        self.ca.selected_ids = None

        self.ca.nfeatures = []
        """Number of features at each step (before running selection)"""

        for fs in self.__feature_selections:

            # enable selected_ids state if it was requested from this class
            fs.ca.change_temporarily(enable_ca=["selected_ids"], other=self)
            if self.ca.is_enabled("nfeatures"):
                self.ca.nfeatures.append(wdataset.nfeatures)

            if __debug__:
                debug('FSPL',
                      'Invoking %s on (%s, %s)' % (fs, wdataset, wtestdataset))
            wdataset, wtestdataset = fs(wdataset, wtestdataset, **kwargs)

            if self.ca.is_enabled("selected_ids"):
                if self.ca.selected_ids == None:
                    self.ca.selected_ids = fs.ca.selected_ids
                else:
                    self.ca.selected_ids = self.ca.selected_ids[
                        fs.ca.selected_ids]

            fs.ca.reset_changed_temporarily()

        return (wdataset, wtestdataset)

    feature_selections = property(fget=lambda self: self.__feature_selections,
                                  doc="List of `FeatureSelections`")
Example #12
    def _set_retrainable(self, value, force=False):
        """Assign value of retrainable parameter

        If retrainable flag is to be changed, classifier has to be
        untrained.  Also internal attributes such as _changedData,
        __changedData_isset, and __idhashes should be initialized if
        it becomes retrainable
        """
        pretrainable = self.params['retrainable']
        if (force or value != pretrainable.value) \
               and 'retrainable' in self.__tags__:
            if __debug__:
                debug("CLF_", "Setting retrainable to %s" % value)
            if 'meta' in self.__tags__:
                warning("Retrainability is not yet crafted/tested for "
                        "meta classifiers. Unpredictable behavior might occur")
            # assure that we don't drag anything behind
            if self.trained:
                self.untrain()
            ca = self.ca
            if not value and ca.has_key('retrained'):
                ca.pop('retrained')
                ca.pop('repredicted')
            if value:
                if not 'retrainable' in self.__tags__:
                    warning(
                        "Setting of flag retrainable for %s has no effect"
                        " since classifier has no such capability. It would"
                        " just lead to resources consumption and slowdown" %
                        self)
                ca['retrained'] = ConditionalAttribute(
                    enabled=True,
                    doc="Either retrainable classifier was retrained")
                ca['repredicted'] = ConditionalAttribute(
                    enabled=True,
                    doc="Either retrainable classifier was repredicted")

            pretrainable.value = value

            # if retrainable we need to keep track of things
            if value:
                self.__idhashes = {
                    'traindata': None,
                    'targets': None,
                    'testdata': None
                }  #, 'testtraindata': None}
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    # ??? it is not clear though if idhash is faster than
                    # simple comparison of (dataset != __traineddataset).any(),
                    # but if we like to get rid of __traineddataset then we
                    # should use idhash anyways
                    self.__trained = self.__idhashes.copy()  # just same Nones
                self.__reset_changed_data()
                self.__invalidatedChangedData = {}
            elif 'retrainable' in self.__tags__:
                #self.__reset_changed_data()
                self.__changedData_isset = False
                self._changedData = None
                self.__idhashes = None
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    self.__trained = None
Example #13
class MixedClass(ClassWithCollections):
    C = Parameter(1.0, min=0, doc="C parameter")
    D = Parameter(3.0, min=0, doc="D parameter")
    state1 = ConditionalAttribute(doc="bogus")
Example #14
class TestClassProper(ClassWithCollections):

    state1 = ConditionalAttribute(enabled=False, doc="state1 doc")
    state2 = ConditionalAttribute(enabled=True, doc="state2 doc")
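
A short, hypothetical interaction with the two attributes declared above; the ca methods mirror those used elsewhere in these examples, and the exact behaviour depends on the PyMVPA version:

obj = TestClassProper()
print obj.ca.is_enabled('state1')     # False -- declared with enabled=False
print obj.ca.is_enabled('state2')     # True
obj.ca.enable('state1')               # switch on at runtime
obj.ca.state1 = 'now stored'
print obj.ca.state1
obj.ca.enable('state2', False)        # same call used to disable (cf. BLR.__init__)
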
Example #15
class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    amount of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right` tail,
    i.e. fraction of the distribution that is lower or larger than some
    critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = ConditionalAttribute(
        enabled=False, doc='Samples obtained for each permutation')

    # XXX shouldn't we may be RF permute_attr into a Permutator class? ;)
    def __init__(self,
                 dist_class=Nonparametric,
                 permutations=100,
                 permute_attr='targets',
                 chunks_attr=None,
                 permute_col='sa',
                 assure_permute=False,
                 **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        Parameters
        ----------
        dist_class : class
          This can be any class which provides parameters estimate
          using `fit()` method to initialize the instance, and
          provides `cdf(x)` method for estimating value of x in CDF.
          All distributions from SciPy's 'stats' module can be used.
        permutations : int
          This many permutations of the labels will be performed to
          determine the distribution under the null hypothesis.
        permute_attr : str
          Name of the samples attribute to permute. ('targets' by default)
        chunks_attr : None or str
          If not None, permutes labels within the chunks,
          i.e. blocks of data having the same value of `chunks_attr`.
        permute_col : str, optional
          What collection `permute_attr` belongs to.
        assure_permute : bool
          Passed to :func:`~mvpa.datasets.misc.permute_attr`. If True,
          assures that targets are permuted, i.e. any one is different from
          the original one
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []  # actual distributions

        self.__permutations = permutations
        """Number of permutations to compute the estimate the null
        distribution."""

        self.permute_attr = permute_attr
        self.chunks_attr = chunks_attr
        self.assure_permute = assure_permute
        self.permute_col = permute_col

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self.permute_attr != 'targets':
            prefixes_ += ['attr=%r' % self.permute_attr]
        if self.chunks_attr:
            prefixes_ += ['chunks_attr=%r' % self.chunks_attr]
        if self.permute_col != 'sa':
            prefixes_ += ['permute_col=%r' % self.permute_col]
        if self.assure_permute:
            prefixes_ += ['assure_permute=%r' % self.assure_permute]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%r' % (self._dist_class, ))
        return super(MCNullDist, self).__repr__(prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute the labels in the training dataset.

        Parameters
        ----------
        measure: (`Featurewise`)`DatasetMeasure` or `TransferError`
          TransferError instance used to compute all errors.
        wdata: `Dataset` which gets permuted and used to compute the
          measure/transfer error multiple times.
        vdata: `Dataset` used for validation.
          If provided measure is assumed to be a `TransferError` and
          working and validation dataset are passed onto it.
        """
        # TODO: place exceptions separately so we could avoid circular imports
        from mvpa.clfs.base import LearnerError

        dist_samples = []
        """Holds the values for randomized labels."""

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutations, p+1), cr=True)

            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            permuted_wdata = wdata.copy('shallow')
            permuted_wdata.permute_attr(attr=self.permute_attr,
                                        chunks_attr=self.chunks_attr,
                                        col=self.permute_col,
                                        assure_permute=self.assure_permute)

            # decide on the arguments to measure
            if not vdata is None:
                measure_args = [vdata, permuted_wdata]
            else:
                measure_args = [permuted_wdata]

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                res = measure(*measure_args)
            except LearnerError, e:
                warning(
                    'Failed to obtain value from %s due to %s.  Measurement'
                    ' was skipped, which could lead to unstable and/or'
                    ' incorrect assessment of the null_dist' % (measure, e))
                # skip this permutation, as announced in the warning above
                continue
            res = np.asanyarray(res)
            dist_samples.append(res)

        if __debug__:
            debug('STATMC', '')

        # store samples
        self.ca.dist_samples = dist_samples = np.asarray(dist_samples)

        # fit distribution per each element

        # to decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, np.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use np.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug(
                    'STAT', 'Estimated parameters for the %s are %s' %
                    (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist
Example #16
 class S12(S1__, S2):
     v12 = ConditionalAttribute()
Example #17
 class S1__(S1_):
     v1__ = ConditionalAttribute(enabled=False)
Example #18
 class S2(ClassWithCollections):
     v2 = ConditionalAttribute(enabled=True, doc="values12 is ...")
Example #19
 class S1(ClassWithCollections):
     v1 = ConditionalAttribute(enabled=True, doc="values1 is ...")
     v1XXX = ConditionalAttribute(enabled=False, doc="values1 is ...")
Example #20
class GeneralizedLinearKernel(NumpyKernel):
    """The linear kernel class.
    """

    sigma_0 = Parameter(1.0,
                        doc="""
       A simple constant squared value which is broadcast across the
       kernel. In the case of GPR -- standard deviation of the Gaussian
       prior probability Normal(0, sigma_0**2) of the intercept of the
       linear regression.""")

    Sigma_p = Parameter(1.0,
                        doc=r"""
       A generic scalar or vector, or diagonal matrix to scale all
       dimensions or associate a different scaling with each dimension
       while computing the kernel matrix:
       :math:`k(x_A,x_B) = x_A^\top \Sigma_p x_B + \sigma_0^2`.
       In the case of GPR -- a scalar or a diagonal of covariance matrix
       of the Gaussian prior probability Normal(0, Sigma_p) on the weights
       of the linear regression.""")

    gradients = ConditionalAttribute(enabled=False,
        doc="Dictionary of gradients per a parameter")

    gradientslog = ConditionalAttribute(enabled=False,
        doc="Dictionary of gradients per a parameter in logspace")

    def __init__(self, *args, **kwargs):
        # for docstring holder
        NumpyKernel.__init__(self, *args, **kwargs)

    ## def __init__(self, Sigma_p=None, sigma_0=1.0, **kwargs):
    ##     """Initialize the linear kernel instance.

    ##     :Parameters:
    ##       Sigma_p : numpy.ndarray
    ##         Covariance matrix of the Gaussian prior probability N(0,Sigma_p)
    ##         on the weights of the linear regression.
    ##         (Defaults to None)
    ##       sigma_0 : float
    ##         the standard deviation of the Gaussian prior N(0,sigma_0**2)
    ##         of the intercept of the linear regression.
    ##         (Deafults to 1.0)
    ##     """
    ##     # init base class first
    ##     NumpyKernel.__init__(self, **kwargs)

    ##     # TODO: figure out cleaner way... probably by using KernelParameters ;-)
    ##     self.Sigma_p = Sigma_p
    ##     self.sigma_0 = sigma_0


    ## def __repr__(self):
    ##     return "%s(Sigma_p=%s, sigma_0=%s)" \
    ##         % (self.__class__.__name__, str(self.Sigma_p), str(self.sigma_0))

    # XXX ??? would we reset correctly to the original value... model selection
    #     currently depends on this I believe
    def reset(self):
        super(GeneralizedLinearKernel, self).reset()
        self._Sigma_p = self._Sigma_p_orig


    def _compute(self, data1, data2):
        """Compute kernel matrix.
        """
        # it is better to use separate lines of computation, so as not to
        # incur computation cost without need (otherwise
        # np.dot(self.Sigma_p, data2.T) can take forever for relatively
        # large number of features)

        Sigma_p = self.params.Sigma_p          # local binding
        sigma_0 = self.params.sigma_0

        #if scalar - scale second term appropriately
        if np.isscalar(Sigma_p):
            if Sigma_p == 1.0:
                data2_sc = data2.T
            else:
                data2_sc = Sigma_p * data2.T

        # if vector use it as diagonal matrix -- ie scale each row by
        # the given value
        elif len(Sigma_p.shape) == 1 and \
                 Sigma_p.shape[0] == data2.shape[1]:
            # which due to numpy broadcasting is the same as product
            # with scalar above
            data2_sc = (Sigma_p * data2).T
        # If (diagonal) or full-matrix -- full-featured and lengthy matrix
        # product
        elif len(Sigma_p.shape) == 2 and \
                 Sigma_p.shape[0] == Sigma_p.shape[1] == data2.shape[1]:
            # which due to numpy broadcasting is the same as product
            # with scalar above
            data2_sc = np.dot(Sigma_p, data2.T)
        else:
            raise ValueError, "Please provide Sigma_p as a scalar, vector, " \
                  "or square (diagonal) matrix."

        # XXX if Sigma_p is changed a warning should be issued!
        # XXX other cases of incorrect Sigma_p could be caught
        self._k = k = np.dot(data1, data2_sc) + sigma_0 ** 2

        # Compute gradients if any was requested
        do_g  = self.ca.is_enabled('gradients')
        do_gl = self.ca.is_enabled('gradientslog')
        if do_g or do_gl:
            if np.isscalar(Sigma_p):
                g_Sigma_p = np.dot(data1, data2.T)
                gl_Sigma_p = Sigma_p * g_Sigma_p
            else:
                nfeat = len(Sigma_p)
                gsize = (len(data1), len(data2), nfeat)
                if do_g:  g_Sigma_p = np.empty(gsize)
                if do_gl: gl_Sigma_p = np.empty(gsize)
                for i in xrange(nfeat):
                    outer = np.multiply.outer(data1[:, i], data2[:, i])
                    if do_g:  g_Sigma_p[:, :, i] = outer
                    if do_gl: gl_Sigma_p[:, :, i] = Sigma_p[i] * outer
            if do_g:
                self.ca.gradients = dict(
                    sigma_0=2*sigma_0,
                    Sigma_p=g_Sigma_p)
            if do_gl:
                self.ca.gradientslog = dict(
                    sigma_0=2*sigma_0**2,
                    Sigma_p=gl_Sigma_p)
    pass
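
A quick, self-contained numpy check (not from the original code) of the claim in the comments of _compute above: for a diagonal Sigma_p the scalar, vector and matrix branches all yield the same kernel matrix k(x_A, x_B) = x_A.T * Sigma_p * x_B + sigma_0**2.

import numpy as np

np.random.seed(1)
data1, data2 = np.random.randn(4, 3), np.random.randn(5, 3)
sigma_0 = 1.0
diag = np.array([0.5, 2.0, 1.5])

# vector branch: scale each feature, then a plain dot product
k_vector = np.dot(data1, (diag * data2).T) + sigma_0 ** 2
# matrix branch: explicit diagonal matrix product
k_matrix = np.dot(data1, np.dot(np.diag(diag), data2.T)) + sigma_0 ** 2
assert np.allclose(k_vector, k_matrix)

# scalar branch agrees with a scaled identity Sigma_p
k_scalar = np.dot(data1, (2.0 * data2).T) + sigma_0 ** 2
assert np.allclose(k_scalar,
                   np.dot(data1, np.dot(2.0 * np.eye(3), data2.T)) + sigma_0 ** 2)
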
Example #21
class LinearSVMWeights(Sensitivity):
    """`Sensitivity` that reports the weights of a linear SVM trained
    on a given `Dataset`.
    """

    biases = ConditionalAttribute(enabled=True,
                                  doc="Offsets of separating hyperplanes")

    def __init__(self, clf, **kwargs):
        """Initialize the analyzer with the classifier it shall use.

        Parameters
        ----------
        clf : LinearSVM
          classifier to use. Only classifiers sub-classed from
          `LinearSVM` may be used.
        """
        # init base classes first
        Sensitivity.__init__(self, clf, **kwargs)

    def __sg_helper(self, svm):
        """Helper function to compute sensitivity for a single given SVM"""
        bias = svm.get_bias()
        svcoef = np.matrix(svm.get_alphas())
        svnums = svm.get_support_vectors()
        svs = self.clf.traindataset.samples[svnums, :]
        res = (svcoef * svs).mean(axis=0).A1
        return res, bias

    def _call(self, dataset):
        # XXX Hm... it might make sense to unify access functions
        # naming across our swig libsvm wrapper and sg access
        # functions for svm
        clf = self.clf
        sgsvm = clf.svm
        sens_labels = None
        if isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
            sens, biases = [], []
            nsvms = sgsvm.get_num_svms()
            clabels = sorted(clf._attrmap.values())
            nclabels = len(clabels)
            sens_labels = []
            isvm = 0  # index for svm among known

            for i in xrange(nclabels):
                for j in xrange(i + 1, nclabels):
                    sgsvmi = sgsvm.get_svm(isvm)
                    labels_tuple = (clabels[i], clabels[j])
                    # Since we gave the labels in incremental order,
                    # we always should be right - but it does not
                    # hurt to check if set of labels is the same
                    if __debug__ and _shogun_exposes_slavesvm_labels:
                        if not sgsvmi.get_labels():
                            # We need to call classify() so labels get assigned
                            # to the multiclass SVM
                            sgsvm.classify()
                        assert (set([
                            sgsvmi.get_label(int(x))
                            for x in sgsvmi.get_support_vectors()
                        ]) == set(labels_tuple))
                    sens1, bias = self.__sg_helper(sgsvmi)
                    sens.append(sens1)
                    biases.append(bias)
                    sens_labels += [labels_tuple[::-1]]  # ??? positive first
                    isvm += 1
            assert (len(sens) == nsvms)  # we should have  covered all
        else:
            sens1, bias = self.__sg_helper(sgsvm)
            biases = np.atleast_1d(bias)
            sens = np.atleast_2d(sens1)
            if not clf.__is_regression__:
                assert (set(clf._attrmap.values()) == set([-1.0, 1.0]))
                assert (sens.shape[0] == 1)
                sens_labels = [(-1.0, 1.0)]

        ds = Dataset(np.atleast_2d(sens))
        if sens_labels is not None:
            if isinstance(sens_labels[0], tuple):
                # Need to have them in array of dtype object
                sens_labels = asobjarray(sens_labels)

            if len(clf._attrmap):
                sens_labels = clf._attrmap.to_literal(sens_labels,
                                                      recurse=True)
            ds.sa[clf.params.targets_attr] = sens_labels
        self.ca.biases = biases

        return ds
Example #22
class TestClassProperChild(TestClassProper):

    state4 = ConditionalAttribute(enabled=False, doc="state4 doc")
Example #23
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on some
    testdatset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.

    References
    ----------
    Such a strategy, after
      Guyon, I., Weston, J., Barnhill, S., & Vapnik, V. (2002). Gene
      selection for cancer classification using support vector
      machines. Mach. Learn., 46(1-3), 389--422.
    was applied to SVM-based analysis of fMRI data in
      Hanson, S. J. & Halchenko, Y. O. (2008). Brain reading using
      full brain support vector machines for object recognition:
      there is no "face identification area". Neural Computation, 20,
      486--503.
    """

    errors = ConditionalAttribute(
        doc="History of errors through RFE")
    nfeatures = ConditionalAttribute(
        doc="History of # of features left")
    history = ConditionalAttribute(
        doc="Last step # when each feature was still present")
    sensitivities = ConditionalAttribute(enabled=False,
        doc="History of sensitivities (might consume too much memory")

    def __init__(self,
                 sensitivity_analyzer,
                 transfer_error,
                 feature_selector=FractionTailSelector(0.05),
                 bestdetector=BestDetector(),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector()),
                 train_clf=None,
                 update_sensitivity=True,
                 **kargs
                 ):
        # XXX Allow for multiple stopping criterions, e.g. error not decreasing
        # anymore OR number of features less than threshold
        """Initialize recursive feature elimination

        Parameters
        ----------
        sensitivity_analyzer : FeaturewiseDatasetMeasure object
        transfer_error : TransferError object
          used to compute the transfer error of a classifier based on a
          certain feature set on the test dataset.
          NOTE: If sensitivity analyzer is based on the same
          classifier as transfer_error is using, make sure you
          initialize transfer_error with train=False, otherwise
          it would train classifier twice without any necessity.
        feature_selector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        bestdetector : Functor
          Given a list of error values it has to return a boolean that
          signals whether the latest error value is the total minimum.
        stopping_criterion : Functor
          Given a list of error values it has to return whether the
          criterion is fulfilled.
        train_clf : bool
          Flag whether the classifier in `transfer_error` should be
          trained before computing the error. In general this is
          required, but if the `sensitivity_analyzer` and
          `transfer_error` share and make use of the same classifier it
          can be switched off to save CPU cycles. Default `None` checks
          if sensitivity_analyzer is based on a classifier and doesn't train
          if so.
        update_sensitivity : bool
          If False the sensitivity map is only computed once and reused
          for each iteration. Otherwise the sensitivities are
          recomputed at each selection step.
        """

        # base init first
        FeatureSelection.__init__(self, **kargs)

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer used to call at each step."""

        self.__transfer_error = transfer_error
        """Compute transfer error for each feature set."""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

        self.__stopping_criterion = stopping_criterion

        self.__bestdetector = bestdetector

        if train_clf is None:
            self.__train_clf = isinstance(sensitivity_analyzer,
                                          Sensitivity)
        else:
            self.__train_clf = train_clf
            """Flag whether training classifier is required."""

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        # force clf training when sensitivities are not updated as otherwise
        # shared classifiers are not retrained
        if not self.__update_sensitivity \
               and isinstance(self.__transfer_error, ClassifierError) \
               and not self.__train_clf:
            if __debug__:
                debug("RFEC", "Forcing training of classifier since " +
                      "sensitivities aren't updated at each step")
            self.__train_clf = True


    def _call(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to compute sensitivity maps and train a classifier
          to determine the transfer error
        testdataset : Dataset
          used to test the trained classifer to determine the
          transfer error

        Returns a tuple of two new datasets with the feature subset of
        `dataset` that had the lowest transfer error of all tested
        sets until the stopping criterion was reached. The first
        dataset is the feature subset of the training data and the
        second the selection of the test dataset.
        """
        errors = []
        """Computed error for each tested features set."""

        ca = self.ca
        ca.nfeatures = []
        """Number of features at each step. Since it is not used by the
        algorithm it is stored directly in the conditional attribute"""

        ca.history = arange(dataset.nfeatures)
        """Store the last step # when the feature was still present
        """

        ca.sensitivities = []

        stop = False
        """Flag when RFE should be stopped."""

        results = None
        """Will hold the best feature set ever."""

        wdataset = dataset
        """Operate on working dataset initially identical."""

        wtestdataset = testdataset
        """Same feature selection has to be performs on test dataset as well.
        This will hold the current testdataset."""

        step = 0
        """Counter how many selection step where done."""

        orig_feature_ids = arange(dataset.nfeatures)
        """List of feature Ids as per original dataset remaining at any given
        step"""

        sensitivity = None
        """Contains the latest sensitivity map."""

        result_selected_ids = orig_feature_ids
        """Resultant ids of selected features. Since the best is not
        necessarily is the last - we better keep this one around. By
        default -- all features are there"""
        selected_ids = result_selected_ids

        while wdataset.nfeatures > 0:

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))

            # mark the features which are present at this step
            # if it brings any mentionable computational burden in the future,
            # only mark removed features at each step
            ca.history[orig_feature_ids] = step

            # Compute sensitivity map
            if self.__update_sensitivity or sensitivity is None:
                sensitivity = self.__sensitivity_analyzer(wdataset)
                if len(sensitivity) > 1:
                    raise ValueError(
                            "RFE cannot handle multiple sensitivities at once. "
                            "'%s' returned %i sensitivities."
                            % (self.__sensitivity_analyzer.__class__.__name__,
                               len(sensitivity)))

            if ca.is_enabled("sensitivities"):
                ca.sensitivities.append(sensitivity)

            # do not retrain clf if not necessary
            if self.__train_clf:
                error = self.__transfer_error(wtestdataset, wdataset)
            else:
                error = self.__transfer_error(wtestdataset, None)

            # Record the error
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            nfeatures = wdataset.nfeatures

            if ca.is_enabled("nfeatures"):
                ca.nfeatures.append(wdataset.nfeatures)

            # store result
            if isthebest:
                results = (wdataset, wtestdataset)
                result_selected_ids = orig_feature_ids

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d " %
                      (step, nfeatures, error, isthebest, stop))

            # stop if it is time to finish
            if nfeatures == 1 or stop:
                break

            # Select features to preserve
            selected_ids = self.__feature_selector(sensitivity)

            if __debug__:
                debug('RFEC_',
                      "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s" %
                      (sensitivity, len(selected_ids), selected_ids))


            # Create a dataset only with selected features
            wdataset = wdataset[:, selected_ids]

            # select corresponding sensitivity values if they are not
            # recomputed
            if not self.__update_sensitivity:
                sensitivity = sensitivity[selected_ids]

            # need to update the test dataset as well
            # XXX why should it ever become None?
            # yoh: because we can have __transfer_error computed
            #      using wdataset. See xia-generalization estimate
            #      in lightsvm. Or for god's sake leave-one-out
            #      on a wdataset
            # TODO: document these cases in this class
            if testdataset is not None:
                wtestdataset = wtestdataset[:, selected_ids]

            step += 1

            # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
            selected_ids.sort()
            if self.ca.is_enabled("history") \
                   or self.ca.is_enabled('selected_ids'):
                orig_feature_ids = orig_feature_ids[selected_ids]


            if hasattr(self.__transfer_error, "clf"):
                self.__transfer_error.clf.untrain()
        # charge conditional attributes
        self.ca.errors = errors
        self.ca.selected_ids = result_selected_ids

        # best dataset ever is returned
        return results
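
# A standalone sketch of the elimination loop implemented above (illustration
# only, not PyMVPA API; `toy_rfe` and the correlation-based "sensitivity" are
# made up for this example). A fraction of the weakest features is discarded
# per step until a fixed number of features remains.
import numpy as np

def toy_rfe(X, y, n_keep=5, drop_fraction=0.2):
    """Return the indices (into the original X) of the surviving features."""
    remaining = np.arange(X.shape[1])
    while len(remaining) > n_keep:
        Xw = X[:, remaining]
        # sensitivity stand-in: |corr(feature, target)| per remaining feature
        sens = np.abs([np.corrcoef(Xw[:, j], y)[0, 1]
                       for j in range(Xw.shape[1])])
        n_drop = min(max(1, int(drop_fraction * len(remaining))),
                     len(remaining) - n_keep)
        keep = np.argsort(sens)[n_drop:]      # discard the weakest features
        remaining = remaining[np.sort(keep)]  # map back to original ids
    return remaining

# X = np.random.randn(40, 30); y = X[:, 0] + 0.1 * np.random.randn(40)
# toy_rfe(X, y)  # -> indices of the 5 surviving features, typically incl. 0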
Example #24
class LinearSVMWeights(Sensitivity):
    """`SensitivityAnalyzer` for the LIBSVM implementation of a linear SVM.
    """

    _ATTRIBUTE_COLLECTIONS = ['params']

    # XXX TODO: should become just as sa may be?
    biases = ConditionalAttribute(enabled=True,
                                  doc="Offsets of separating hyper-planes")

    split_weights = Parameter(
        False,
        allowedtype='bool',
        doc="If binary classification either to sum SVs per each "
        "class separately.  Note: be careful with interpretation"
        " of the values")

    def __init__(self, clf, **kwargs):
        """Initialize the analyzer with the classifier it shall use.

        Parameters
        ----------
        clf : LinearSVM
          classifier to use. Only classifiers sub-classed from
          `LinearSVM` may be used.
        """
        # init base classes first
        Sensitivity.__init__(self, clf, **kwargs)

    def _call(self, dataset, callables=[]):
        # local bindings
        clf = self.clf
        model = clf.model

        # Labels for sensitivities to be returned
        sens_labels = None

        if clf.__is_regression__:
            nr_class = None
            svm_labels = None  # shouldn't bother to provide "targets" for regressions
        else:
            nr_class = model.nr_class
            svm_labels = model.labels

        # No need to warn since by default we do not do
        # anything evil and we provide labels -- so it is up to the user
        # to decide whether to do something silly
        #if nr_class != 2:
        #    warning("You are estimating sensitivity for SVM %s trained on %d" %
        #            (str(clf), nr_class) +
        #            " classes. Make sure that it is what you intended to do" )

        svcoef = np.matrix(model.get_sv_coef())
        svs = np.matrix(model.get_sv())
        rhos = np.asarray(model.get_rho())

        self.ca.biases = rhos
        if self.params.split_weights:
            if nr_class != 2:
                raise NotImplementedError, \
                      "Cannot compute per-class weights for" \
                      " non-binary classification task"
            # libsvm might have a different idea on the ordering
            # of labels, so we would need to map them back explicitly
            ds_labels = list(dataset.sa[
                clf.params.targets_attr].unique)  # labels in the dataset
            senses = [None for i in ds_labels]
            # first label is given positive value
            for i, (c, l) in enumerate([(svcoef > 0, lambda x: x),
                                        (svcoef < 0, lambda x: x * -1)]):
                # convert to array, and just take the meaningful dimension
                c_ = c.A[0]
                # NOTE svm_labels are numerical; ds_labels are literal
                senses[ds_labels.index(
                            clf._attrmap.to_literal(svm_labels[i]))] = \
                                (l(svcoef[:, c_] * svs[c_, :])).A[0]
            weights = np.array(senses)
            sens_labels = svm_labels
        else:
            # XXX yoh: .mean() is effectively
            # averages across "sensitivities" of all paired classifiers (I
            # think). See more info on this topic in svm.py on how sv_coefs
            # are stored
            #
            # First multiply SV coefficients with the actual SVs to get
            # weighted impact of SVs on decision, then for each feature
            # take mean across SVs to get a single weight value
            # per feature
            if nr_class is None or nr_class <= 2:
                # as simple as this
                weights = (svcoef * svs).A
                # and only in case of classification
                if nr_class:
                # ??? First label seems to correspond to positive
                    sens_labels = [tuple(svm_labels[::-1])]
            else:
                # we need to compose correctly per each pair of classifiers.
                # See docstring for get_sv_coef for more details on internal
                # structure of bloody storage

                # total # of pairs
                npairs = nr_class * (nr_class - 1) / 2
                # # of SVs in each class
                NSVs_perclass = model.get_n_sv()
                # indices where each class starts in each row of SVs
                # name is after similar variable in libsvm internals
                nz_start = np.cumsum([0] + NSVs_perclass[:-1])
                nz_end = nz_start + NSVs_perclass
                # reserve storage
                weights = np.zeros((npairs, svs.shape[1]))
                ipair = 0  # index of the pair
                """
                // classifier (i,j): coefficients with
				// i are in sv_coef[j-1][nz_start[i]...],
				// j are in sv_coef[i][nz_start[j]...]
                """
                sens_labels = []
                for i in xrange(nr_class):
                    for j in xrange(i + 1, nr_class):
                        weights[ipair, :] = np.asarray(
                            svcoef[j - 1, nz_start[i]:nz_end[i]] *
                            svs[nz_start[i]:nz_end[i]] +
                            svcoef[i, nz_start[j]:nz_end[j]] *
                            svs[nz_start[j]:nz_end[j]])
                        # ??? First label corresponds to positive
                        # that is why [j], [i]
                        sens_labels += [(svm_labels[j], svm_labels[i])]
                        ipair += 1  # go to the next pair
                assert (ipair == npairs)

        if __debug__ and 'SVM' in debug.active:
            if nr_class:
                nsvs = model.get_n_sv()
            else:
                nsvs = model.get_total_n_sv()

            debug('SVM',
                  "Extracting weights for %s-class SVM: #SVs=%s, " % \
                  (nr_class, nsvs) + \
                  " SVcoefshape=%s SVs.shape=%s Rhos=%s." % \
                  (svcoef.shape, svs.shape, rhos) + \
                  " Result: min=%f max=%f" % (np.min(weights), np.max(weights)))

        ds_kwargs = {}
        if nr_class:  # for classification only
            # and we should have prepared the labels
            assert (sens_labels is not None)

            if len(clf._attrmap):
                if isinstance(sens_labels[0], tuple):
                    sens_labels = asobjarray(sens_labels)
                sens_labels = clf._attrmap.to_literal(sens_labels,
                                                      recurse=True)

            # NOTE: `weights` is already and always 2D
            ds_kwargs = dict(sa={clf.params.targets_attr: sens_labels})

        weights_ds = Dataset(weights, **ds_kwargs)
        return weights_ds

    _customizeDocInherit = True
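
# A minimal numpy illustration (not tied to libsvm) of the core step above:
# for a linear SVM the weight vector is the dual-coefficient-weighted sum of
# the support vectors, which is what `(svcoef * svs).A` computes in the
# binary case. The numbers below are made up.
import numpy as np

sv_coef = np.matrix([[0.7, -0.2, -0.5]])   # alpha_i * y_i, one per support vector
svs = np.matrix([[1.0, 0.0],               # the support vectors, one per row
                 [0.2, 0.9],
                 [0.4, 0.3]])
w = (sv_coef * svs).A                      # weights, shape (1, n_features)
# the decision value for a new sample x would then be  w.dot(x) - rho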
Example #25
class CombinedFeatureSelection(FeatureSelection):
    """Meta feature selection utilizing several embedded selection methods.

    Each embedded feature selection method is computed individually. Afterwards
    all feature sets are combined by either taking the union or intersection of
    all sets.

    The individual feature sets of all embedded methods are optionally available
    from the `selections_ids` conditional attribute.
    """
    selections_ids = ConditionalAttribute(
        doc="List of feature id sets for each performed method.")

    def __init__(self, feature_selections, combiner, **kwargs):
        """
        Parameters
        ----------
        feature_selections : list
          FeatureSelection instances to run. Order is not important.
        combiner : 'union', 'intersection'
          Which method to use to combine the feature selection sets of
          all computed methods.
        """
        FeatureSelection.__init__(self, **kwargs)

        self.__feature_selections = feature_selections
        self.__combiner = combiner

    def untrain(self):
        if __debug__:
            debug("FS_", "Untraining combined FS: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def _call(self, dataset, testdataset=None):
        """Really run it.
        """
        # to hold the union
        selected_ids = None
        # to hold the individuals
        self.ca.selections_ids = []

        for fs in self.__feature_selections:
            # we need the feature ids that were selected by each method,
            # so enable them temporarily
            fs.ca.change_temporarily(enable_ca=["selected_ids"], other=self)

            # compute feature selection, but ignore return datasets
            fs(dataset, testdataset)

            # retrieve feature ids and determine the union of all selections
            if selected_ids is None:
                selected_ids = set(fs.ca.selected_ids)
            else:
                if self.__combiner == 'union':
                    selected_ids.update(fs.ca.selected_ids)
                elif self.__combiner == 'intersection':
                    selected_ids.intersection_update(fs.ca.selected_ids)
                else:
                    raise ValueError, "Unknown combiner '%s'" % self.__combiner

            # store individual set in state
            self.ca.selections_ids.append(fs.ca.selected_ids)

            # restore ca to previous settings
            fs.ca.reset_changed_temporarily()

        # finally apply the combined feature selection to the original datasets
        selected_ids = sorted(list(selected_ids))

        # take care of optional second dataset
        td_sel = None
        if testdataset is not None:
            # use the freshly combined ids; the conditional attribute is
            # only charged below
            td_sel = testdataset[:, selected_ids]

        # and main dataset
        d_sel = dataset[:, selected_ids]

        # finally store ids in state
        self.ca.selected_ids = selected_ids

        return (d_sel, td_sel)

    feature_selections = property(fget=lambda self: self.__feature_selections,
                                  doc="List of `FeatureSelections`")
    combiner = property(fget=lambda self: self.__combiner,
                        doc="Selection set combination method.")
Example #26
class Classifier(ClassWithCollections):
    """Abstract classifier class to be inherited by all classifiers
    """

    # Kept separate from doc to not pollute help(clf), especially if
    # we include help for the parent class
    _DEV__doc__ = """
    Required behavior:

    Every classifier has to be instantiable without
    having to specify the training pattern.

    Repeated calls to the train() method with different training data have to
    result in a valid classifier, trained for the particular dataset.

    It must be possible to specify all classifier parameters as keyword
    arguments to the constructor.

    Recommended behavior:

    Derived classifiers should provide access to *estimates* -- i.e. that
    information that is finally used to determine the predicted class label.

    Michael: Maybe it works well if each classifier provides an 'estimates'
             state member. This variable is a list as long as, and in the same
             order as, Dataset.uniquetargets (training data). Each item in the
             list corresponds to the likelihood that a sample belongs to the
             respective class. However the semantics might differ between
             classifiers, e.g. kNN would probably store distances to class-
             neighbors, whereas PLR would store the raw function value of the
             logistic function. So in the case of kNN low is predictive and for
             PLR high is predictive. It is unclear whether there is a need to
             unify that.

             As the storage and/or computation of this information might be
             demanding, its collection should be switchable and off by default.

    Nomenclature
     * predictions  : result of the last call to .predict()
     * estimates : might be different from predictions if a classifier's predict()
                   makes a decision based on some internal value such as
                   probability or a distance.
    """
    # Dict that contains the parameters of a classifier.
    # This shall provide an interface to plug generic parameter optimizer
    # on all classifiers (e.g. grid- or line-search optimizer)
    # A dictionary is used because Michael thinks that access by name is nicer.
    # Additionally Michael thinks ATM that additional information might be
    # necessary in some situations (e.g. reasonably predefined parameter range,
    # minimal iteration stepsize, ...), therefore the value to each key should
    # also be a dict or we should use mvpa.misc.param.Parameter'...

    trained_targets = ConditionalAttribute(
        enabled=True, doc="Set of unique targets it has been trained on")

    trained_nsamples = ConditionalAttribute(
        enabled=True, doc="Number of samples it has been trained on")

    trained_dataset = ConditionalAttribute(
        enabled=False, doc="The dataset it has been trained on")

    training_confusion = ConditionalAttribute(
        enabled=False, doc="Confusion matrix of learning performance")

    predictions = ConditionalAttribute(enabled=True,
                                       doc="Most recent set of predictions")

    estimates = ConditionalAttribute(
        enabled=True,
        doc="Internal classifier estimates the most recent " +
        "predictions are based on")

    training_time = ConditionalAttribute(
        enabled=True, doc="Time (in seconds) which took classifier to train")

    predicting_time = ConditionalAttribute(
        enabled=True, doc="Time (in seconds) which took classifier to predict")

    feature_ids = ConditionalAttribute(
        enabled=False,
        doc="Feature IDS which were used for the actual training.")

    __tags__ = []
    """Describes some specifics about the classifier -- is that it is
    doing regression for instance...."""

    targets_attr = Parameter(
        'targets',
        allowedtype='str',  # ro=True,
        doc="""What samples attribute to use as targets.""",
        index=999)

    # TODO: make it available only for actually retrainable classifiers
    retrainable = Parameter(
        False,
        allowedtype='bool',
        doc="""Either to enable retraining for 'retrainable' classifier.""",
        index=1002)

    def __init__(self, **kwargs):
        ClassWithCollections.__init__(self, **kwargs)

        # XXX
        # the place to map literal to numerical labels (and back)
        # this needs to be in the base class, since some classifiers also
        # have this nasty 'regression' mode, and the code in this class
        # needs to deal with converting the regression output into discrete
        # labels
        # however, preferably the mapping should be kept in the respective
        # low-level implementations that need it
        self._attrmap = AttributeMap()

        self.__trainednfeatures = None
        """Stores number of features for which classifier was trained.
        If None -- it wasn't trained at all"""

        self._set_retrainable(self.params.retrainable, force=True)

        # deprecate
        #self.__trainedidhash = None
        #"""Stores id of the dataset on which it was trained to signal
        #in trained() if it was trained already on the same dataset"""

    @property
    def __summary_class__(self):
        if 'regression' in self.__tags__:
            return RegressionStatistics
        else:
            return ConfusionMatrix

    @property
    def __is_regression__(self):
        return 'regression' in self.__tags__

    def __str__(self):
        if __debug__ and 'CLF_' in debug.active:
            return "%s / %s" % (repr(self), super(Classifier, self).__str__())
        else:
            return repr(self)

    def __repr__(self, prefixes=[]):
        return super(Classifier, self).__repr__(prefixes=prefixes)

    def _pretrain(self, dataset):
        """Functionality prior to training
        """
        # So we reset all conditional attributes and maybe free up some memory
        # explicitly
        params = self.params
        if not params.retrainable:
            self.untrain()
        else:
            # just reset the ca, do not untrain
            self.ca.reset()
            if not self.__changedData_isset:
                self.__reset_changed_data()
                _changedData = self._changedData
                __idhashes = self.__idhashes
                __invalidatedChangedData = self.__invalidatedChangedData

                # if we don't know what was changed we need to figure
                # them out
                if __debug__:
                    debug('CLF_', "IDHashes are %s" % (__idhashes))

                # Look at the data if any was changed
                for key, data_ in (('traindata', dataset.samples),
                                   ('targets',
                                    dataset.sa[params.targets_attr].value)):
                    _changedData[key] = self.__was_data_changed(key, data_)
                    # if those idhashes were invalidated by retraining
                    # we need to adjust _changedData accordingly
                    if __invalidatedChangedData.get(key, False):
                        if __debug__ and not _changedData[key]:
                            debug(
                                'CLF_', 'Found that idhash for %s was '
                                'invalidated by retraining' % key)
                        _changedData[key] = True

                # Look at the parameters
                for col in self._paramscols:
                    changedParams = self._collections[col].which_set()
                    if len(changedParams):
                        _changedData[col] = changedParams

                self.__invalidatedChangedData = {}  # reset it on training

                if __debug__:
                    debug('CLF_',
                          "Obtained _changedData is %s" % (self._changedData))

    def _posttrain(self, dataset):
        """Functionality post training

        For instance -- computing confusion matrix.

        Parameters
        ----------
        dataset : Dataset
          Data which was used for training
        """
        ca = self.ca
        if ca.is_enabled('trained_targets'):
            ca.trained_targets = dataset.sa[self.params.targets_attr].unique

        ca.trained_dataset = dataset
        ca.trained_nsamples = dataset.nsamples

        # needs to be assigned first since below we use predict
        self.__trainednfeatures = dataset.nfeatures

        if __debug__ and 'CHECK_TRAINED' in debug.active:
            self.__trainedidhash = dataset.idhash

        if self.ca.is_enabled('training_confusion') and \
               not self.ca.is_set('training_confusion'):
            # we should not store predictions for training data,
            # it is confusing imho (yoh)
            self.ca.change_temporarily(disable_ca=["predictions"])
            if self.params.retrainable:
                # we would need to recheck if data is the same,
                # XXX think if there is a way to make this all
                # efficient. For now, probably, retrainable
                # classifiers have no chance but not to use
                # training_confusion... sad
                self.__changedData_isset = False
            predictions = self.predict(dataset)
            self.ca.reset_changed_temporarily()
            self.ca.training_confusion = self.__summary_class__(
                targets=dataset.sa[self.params.targets_attr].value,
                predictions=predictions)

        if self.ca.is_enabled('feature_ids'):
            self.ca.feature_ids = self._get_feature_ids()

    ##REF: Name was automagically refactored
    def _get_feature_ids(self):
        """Virtual method to return feature_ids used while training

        Is not intended to be called anywhere but from _posttrain,
        thus classifier is assumed to be trained at this point
        """
        # By default all features are used
        return range(self.__trainednfeatures)

    def summary(self):
        """Providing summary over the classifier"""

        s = "Classifier %s" % self
        ca = self.ca
        ca_enabled = ca.enabled

        if self.trained:
            s += "\n trained"
            if ca.is_set('training_time'):
                s += ' in %.3g sec' % ca.training_time
            s += ' on data with'
            if ca.is_set('trained_targets'):
                s += ' targets:%s' % list(ca.trained_targets)

            nsamples, nchunks = None, None
            if ca.is_set('trained_nsamples'):
                nsamples = ca.trained_nsamples
            if ca.is_set('trained_dataset'):
                td = ca.trained_dataset
                nsamples, nchunks = td.nsamples, len(td.sa['chunks'].unique)
            if nsamples is not None:
                s += ' #samples:%d' % nsamples
            if nchunks is not None:
                s += ' #chunks:%d' % nchunks

            s += " #features:%d" % self.__trainednfeatures
            if ca.is_set('feature_ids'):
                s += ", used #features:%d" % len(ca.feature_ids)
            if ca.is_set('training_confusion'):
                s += ", training error:%.3g" % ca.training_confusion.error
        else:
            s += "\n not yet trained"

        if len(ca_enabled):
            s += "\n enabled ca:%s" % ', '.join(
                [str(ca[x]) for x in ca_enabled])
        return s

    def clone(self):
        """Create full copy of the classifier.

        It might require the classifier to be untrained first due to
        the present SWIG bindings.

        TODO: think about a proper re-implementation that does not rely on deepcopy
        """
        if __debug__:
            debug("CLF", "Cloning %s#%s" % (self, id(self)))
        try:
            return deepcopy(self)
        except:
            self.untrain()
            return deepcopy(self)

    def _train(self, dataset):
        """Function to be actually overridden in derived classes
        """
        raise NotImplementedError

    def train(self, dataset):
        """Train classifier on a dataset

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so
        """
        if dataset.nfeatures == 0 or dataset.nsamples == 0:
            raise DegenerateInputError, \
                  "Cannot train classifier on degenerate data %s" % dataset
        if __debug__:
            debug("CLF",
                  "Training classifier %(clf)s on dataset %(dataset)s",
                  msgargs={
                      'clf': self,
                      'dataset': dataset
                  })

        self._pretrain(dataset)

        # remember the time when started training
        t0 = time.time()

        if dataset.nfeatures > 0:

            result = self._train(dataset)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("CLF",
                      "No features present for training, no actual training " \
                      "is called")
            result = None

        self.ca.training_time = time.time() - t0
        self._posttrain(dataset)
        return result

    def _prepredict(self, dataset):
        """Functionality prior prediction
        """
        if 'notrain2predict' not in self.__tags__:
            # check if classifier was trained if that is needed
            if not self.trained:
                raise ValueError, \
                      "Classifier %s wasn't yet trained, therefore can't " \
                      "predict" % self
            nfeatures = dataset.nfeatures  #data.shape[1]
            # check if number of features is the same as in the data
            # it was trained on
            if nfeatures != self.__trainednfeatures:
                raise ValueError, \
                      "Classifier %s was trained on data with %d features, " % \
                      (self, self.__trainednfeatures) + \
                      "thus can't predict for %d features" % nfeatures

        if self.params.retrainable:
            if not self.__changedData_isset:
                self.__reset_changed_data()
                _changedData = self._changedData
                data = np.asanyarray(dataset.samples)
                _changedData['testdata'] = \
                                        self.__was_data_changed('testdata', data)
                if __debug__:
                    debug(
                        'CLF_', "prepredict: Obtained _changedData is %s" %
                        (_changedData))

    def _postpredict(self, dataset, result):
        """Functionality after prediction is computed
        """
        self.ca.predictions = result
        if self.params.retrainable:
            self.__changedData_isset = False

    def _predict(self, dataset):
        """Actual prediction
        """
        raise NotImplementedError

    @accepts_samples_as_dataset
    def predict(self, dataset):
        """Predict classifier on data

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so. Also, subclasses that need to invoke the super class's
        prediction from within _predict should call _predict instead of
        predict(), since otherwise it would loop.
        """
        ## ??? yoh: changed to asany from as without exhaustive check
        data = np.asanyarray(dataset.samples)
        if __debug__:
            debug("CLF",
                  "Predicting classifier %(clf)s on ds %(dataset)s",
                  msgargs={
                      'clf': self,
                      'dataset': dataset
                  })

        # remember the time when started computing predictions
        t0 = time.time()

        ca = self.ca
        # to assure that those are reset (could be set due to testing
        # post-training)
        ca.reset(['estimates', 'predictions'])

        self._prepredict(dataset)

        if self.__trainednfeatures > 0 \
               or 'notrain2predict' in self.__tags__:
            result = self._predict(dataset)
        else:
            warning(
                "Trying to predict using classifier trained on no features")
            if __debug__:
                debug("CLF",
                      "No features were present for training, prediction is " \
                      "bogus")
            result = [None] * data.shape[0]

        ca.predicting_time = time.time() - t0

        # with labels mapping in-place, we also need to go back to the
        # literal labels
        if self._attrmap:
            try:
                result = self._attrmap.to_literal(result)
            except KeyError, e:
                raise FailedToPredictError, \
                      "Failed to convert predictions from numeric into " \
                      "literals: %s" % e

        self._postpredict(dataset, result)
        return result
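
# A toy, framework-free illustration of the train()/predict() life cycle the
# base class enforces (refuse to predict when untrained, feature-count check,
# bookkeeping of what was learned); `ToyMeanClassifier` is invented for this
# sketch and is not a PyMVPA classifier.
import numpy as np

class ToyMeanClassifier(object):
    """Predicts the class whose training mean is closest to the sample."""
    def __init__(self):
        self._means = None            # None == "not yet trained"
        self._nfeatures = None

    def train(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        self._nfeatures = X.shape[1]
        self._means = dict((label, X[y == label].mean(axis=0))
                           for label in np.unique(y))

    def predict(self, X):
        X = np.asarray(X)
        if self._means is None:
            raise ValueError("classifier wasn't yet trained, can't predict")
        if X.shape[1] != self._nfeatures:
            raise ValueError("feature count differs from the training data")
        labels = sorted(self._means)
        dists = np.array([[np.linalg.norm(x - self._means[l]) for l in labels]
                          for x in X])
        return [labels[i] for i in dists.argmin(axis=1)]

# clf = ToyMeanClassifier()
# clf.train([[0, 0], [1, 1], [9, 9]], ['a', 'a', 'b'])
# clf.predict([[0.3, 0.2], [8, 8]])  # -> ['a', 'b']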
Example #27
class kNN(Classifier):
    """
    k-Nearest-Neighbour classifier.

    This is a simple classifier that bases its decision on the distances
    between the training dataset samples and the test sample(s). Distances
    are computed using a customizable distance function. A certain number
    (`k`) of nearest neighbors is selected based on the smallest distances
    and the labels of these neighboring samples are fed into a voting
    function to determine the labels of the test sample.

    Training a kNN classifier is extremely quick, as no actual training
    is performed; the training dataset is simply stored in the
    classifier. All computations are done during classifier prediction.

    Notes
    -----
    If enabled, kNN stores the votes per class in the 'estimates' conditional
    attribute after calling predict().

    """

    distances = ConditionalAttribute(enabled=False,
                                     doc="Distances computed for each sample")

    __tags__ = ['knn', 'non-linear', 'binary', 'multiclass', 'notrain2predict']

    def __init__(self,
                 k=2,
                 dfx=squared_euclidean_distance,
                 voting='weighted',
                 **kwargs):
        """
        Parameters
        ----------
        k : unsigned integer
          Number of nearest neighbours to be used for voting.
        dfx : functor
          Function to compute the distances between training and test samples.
          Default: squared euclidean distance
        voting : str
          Voting method used to derive predictions from the nearest neighbors.
          Possible values are 'majority' (simple majority of classes
          determines vote) and 'weighted' (votes are weighted according to the
          relative frequencies of each class in the training data).
        **kwargs
          Additional arguments are passed to the base class.
        """

        # init base class first
        Classifier.__init__(self, **kwargs)

        self.__k = k
        self.__dfx = dfx
        self.__voting = voting
        self.__data = None

    def __repr__(self, prefixes=[]):
        """Representation of the object
        """
        return super(kNN, self).__repr__([
            "k=%d" % self.__k,
            "dfx=%s" % self.__dfx,
            "voting=%s" % repr(self.__voting)
        ] + prefixes)

    def __str__(self):
        return "%s\n data: %s" % \
            (Classifier.__str__(self), indent_doc(self.__data))

    def _train(self, data):
        """Train the classifier.

        For kNN it is degenerate -- just stores the data.
        """
        self.__data = data
        if __debug__:
            if str(data.samples.dtype).startswith('uint') \
                or str(data.samples.dtype).startswith('int'):
                warning("kNN: input data is in integers. " + \
                        "Overflow on arithmetic operations might result in"+\
                        " errors. Please convert dataset's samples into" +\
                        " floating datatype if any error is reported.")
        self.__weights = None

        # create dictionary with an item for each condition
        uniquelabels = data.sa[self.params.targets_attr].unique
        self.__votes_init = dict(zip(uniquelabels, [0] * len(uniquelabels)))

    @accepts_dataset_as_samples
    def _predict(self, data):
        """Predict the class labels for the provided data.

        Returns a list of class labels (one for each data sample).
        """
        # make sure we're talking about arrays
        data = np.asarray(data)

        # checks only in debug mode
        if __debug__:
            if not data.ndim == 2:
                raise ValueError, "Data array must be two-dimensional."

            if not data.shape[1] == self.__data.nfeatures:
                raise ValueError, "Length of data samples (features) does " \
                                  "not match the classifier."

        # compute the distance matrix between training and test data with
        # distances stored row-wise, ie. distances between test sample [0]
        # and all training samples will end up in row 0
        dists = self.__dfx(self.__data.samples, data).T
        if self.ca.is_enabled('distances'):
            # TODO: theoretically we should have used deepcopy for sa
            #       here
            self.ca.distances = Dataset(dists, fa=self.__data.sa.copy())

        # determine the k nearest neighbors per test sample
        knns = dists.argsort(axis=1)[:, :self.__k]

        # predicted class labels will go here
        predicted = []

        if self.__voting == 'majority':
            vfx = self.get_majority_vote
        elif self.__voting == 'weighted':
            vfx = self.get_weighted_vote
        else:
            raise ValueError, "kNN told to perform unknown voting '%s'." \
                  % self.__voting

        # perform voting
        results = [vfx(knn) for knn in knns]

        # extract predictions
        predicted = [r[0] for r in results]

        # store the predictions in the state. Relies on State._setitem to do
        # nothing if the relevant state member is not enabled
        self.ca.predictions = predicted
        self.ca.estimates = np.array([r[1] for r in results])

        return predicted

    ##REF: Name was automagically refactored
    def get_majority_vote(self, knn_ids):
        """Simple voting by choosing the majority of class neighbors.
        """
        # local bindings
        _data = self.__data

        targets_sa_name = self.params.targets_attr
        targets_sa = _data.sa[targets_sa_name]

        labels = targets_sa.value
        uniquelabels = targets_sa.unique

        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[labels[nn]] += 1

        # find the class with most votes
        # return votes as well to store them in the state
        if _dict_has_key:
            # approx 5% faster implementation than below
            maxvotes = max(votes.iteritems(), key=lambda x: x[1])[0]
        else:
            # no key keyword for max in older Python versions
            maxvotes = max([(v, k) for k, v in votes.iteritems()])[1]

        return maxvotes, \
                [votes[ul] for ul in uniquelabels] # transform into lists

    ##REF: Name was automagically refactored
    def get_weighted_vote(self, knn_ids):
        """Vote with classes weighted by the number of samples per class.
        """
        # local bindings
        _data = self.__data
        targets_sa_name = self.params.targets_attr
        targets_sa = _data.sa[targets_sa_name]

        uniquelabels = targets_sa.unique

        # Lazy evaluation
        if self.__weights is None:
            #
            # It seemed to Yarik that this has to be evaluated just once per
            # training dataset.
            #
            self.__labels = labels = targets_sa.value
            Nlabels = len(labels)
            Nuniquelabels = len(uniquelabels)

            # TODO: To get a proper speedup for the next line only, the
            #       histogram should be computed via sorting + counting
            #       "same" elements while reducing. Guaranteed complexity
            #       would be NlogN whereas now it is N^2
            # compute the relative proportion of samples belonging to each
            # class (do it in one loop to improve speed and reduce readability)
            # NOTE: float() guards against integer division under Python 2
            self.__weights = \
                [ 1.0 - ((labels == label).sum() / float(Nlabels)) \
                    for label in uniquelabels ]
            self.__weights = dict(zip(uniquelabels, self.__weights))

        labels = self.__labels
        # number of occurrences for each unique class in kNNs
        votes = self.__votes_init.copy()
        for nn in knn_ids:
            votes[labels[nn]] += 1

        # weight votes
        votes = [self.__weights[ul] * votes[ul] for ul in uniquelabels]

        # find the class with most votes
        # return votes as well to store them in the state
        return uniquelabels[np.asarray(votes).argmax()], \
               votes

    def untrain(self):
        """Reset trained state"""
        self.__data = None
        super(kNN, self).untrain()
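
# A compact numpy version of the prediction path above (squared Euclidean
# distances, take the k smallest, majority vote); `toy_knn_predict` is made
# up for this illustration and is independent of the class above.
import numpy as np

def toy_knn_predict(train_X, train_y, test_X, k=3):
    train_X = np.asarray(train_X, dtype=float)
    test_X = np.asarray(test_X, dtype=float)
    labels, y_idx = np.unique(train_y, return_inverse=True)
    # squared euclidean distances: one row per test sample
    dists = ((test_X[:, None, :] - train_X[None, :, :]) ** 2).sum(axis=2)
    knns = dists.argsort(axis=1)[:, :k]          # k nearest training samples
    votes = np.array([np.bincount(y_idx[nn], minlength=len(labels))
                      for nn in knns])
    return labels[votes.argmax(axis=1)]

# toy_knn_predict([[0, 0], [0, 1], [5, 5]], ['a', 'a', 'b'],
#                 [[0.2, 0.3], [4.0, 5.0]], k=2)
# -> array(['a', 'b'], ...)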
Example #28
class GPR(Classifier):
    """Gaussian Process Regression (GPR).

    """

    predicted_variances = ConditionalAttribute(
        enabled=False, doc="Variance per each predicted value")

    log_marginal_likelihood = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood")

    log_marginal_likelihood_gradient = ConditionalAttribute(
        enabled=False, doc="Log Marginal Likelihood Gradient")

    __tags__ = ['gpr', 'regression', 'retrainable']

    # NOTE XXX Parameters of the classifier. Values available as
    # clf.parameter or clf.params.parameter, or as
    # clf.params['parameter'] (as the full Parameter object)
    #
    # __doc__ and __repr__ for the class are conveniently adjusted to
    # reflect the values of those params

    # Kernel machines/classifiers should be refactored also to behave
    # the same and define kernel parameter appropriately... TODO, but SVMs
    # already kinda do it nicely ;-)

    sigma_noise = Parameter(
        0.001,
        allowedtype='float',
        min=1e-10,
        doc="the standard deviation of the gaussian noise.")

    # XXX For now I don't introduce kernel parameter since yet to unify
    # kernel machines
    #kernel = Parameter(None, allowedtype='Kernel',
    #    doc="Kernel object defining the covariance between instances. "
    #        "(Defaults to KernelSquaredExponential if None in arguments)")

    lm = Parameter(None,
                   min=0.0,
                   allowedtype='None or float',
                   doc="""The regularization term lambda.
        Increase this when the kernel matrix is not positive definite. If None,
        some regularization will be provided upon necessity""")

    def __init__(self, kernel=None, **kwargs):
        """Initialize a GPR regression analysis.

        Parameters
        ----------
        kernel : Kernel
          a kernel object defining the covariance between instances.
          (Defaults to SquaredExponentialKernel if None in arguments)
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # It does not make sense to calculate a confusion matrix for a GPR
        # XXX it does ;) it will be a RegressionStatistics actually ;-)
        # So if someone desires -- let him have it
        # self.ca.enable('training_confusion', False)

        # set kernel:
        if kernel is None:
            kernel = SquaredExponentialKernel()
            debug(
                "GPR",
                "No kernel was provided, falling back to default: %s" % kernel)
        self.__kernel = kernel

        # append proper clf_internal depending on the kernel
        # TODO: add "__tags__" to kernels since the check
        #       below does not scale
        if isinstance(kernel, GeneralizedLinearKernel) or \
           isinstance(kernel, LinearKernel):
            self.__tags__ += ['linear']
        else:
            self.__tags__ += ['non-linear']
            if externals.exists('openopt'):
                self.__tags__ += ['has_sensitivity']

        # No need to initialize conditional attributes. Unless they got set
        # they would raise an exception.
        # self.predicted_variances = None
        # self.log_marginal_likelihood = None
        self._init_internals()
        pass

    def _init_internals(self):
        """Reset some internal variables to None.

        To be used in constructor and untrain()
        """
        self._train_fv = None
        self._labels = None
        self._km_train_train = None
        self._train_labels = None
        self._alpha = None
        self._L = None
        self._LL = None
        # XXX EO: useful for model selection but not working in general
        # self.__kernel.reset()
        pass

    def __repr__(self):
        """String summary of the object
        """
        return super(GPR,
                     self).__repr__(prefixes=['kernel=%s' % self.__kernel])

    def compute_log_marginal_likelihood(self):
        """
        Compute the log marginal likelihood using self._train_fv and self._train_labels.
        """
        if __debug__:
            debug("GPR", "Computing log_marginal_likelihood")
        self.ca.log_marginal_likelihood = \
                                 -0.5*Ndot(self._train_labels, self._alpha) - \
                                  Nlog(self._L.diagonal()).sum() - \
                                  self._km_train_train.shape[0] * _halflog2pi
        return self.ca.log_marginal_likelihood
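        # Hedged annotation (not from the source): with C = K + sigma_noise**2 * I,
        # L its lower Cholesky factor and alpha = C^{-1} y, the expression above
        # corresponds to the standard GPR result
        #   log p(y|X) = -0.5 * y^T alpha - sum_i log(L_ii) - (n/2) * log(2*pi)
        # assuming _halflog2pi equals 0.5 * log(2*pi).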

    def compute_gradient_log_marginal_likelihood(self):
        """Compute gradient of the log marginal likelihood. This
        version use a more compact formula provided by Williams and
        Rasmussen book.
        """
        # XXX EO: check whether the precomputed self.alpha self.Kinv
        # are actually the ones corresponding to the hyperparameters
        # used to compute this gradient!
        # YYY EO: currently this is verified outside gpr.py but it is
        # not an efficient solution.
        # XXX EO: Do some memoizing since it could happen that some
        # hyperparameters are kept constant by user request, so we
        # don't need (sometimes) to recompute the corresponding
        # gradient again. COULD THIS BE TAKEN INTO ACCOUNT BY THE
        # NEW CACHED KERNEL INFRASTRUCTURE?

        # self.Kinv = np.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, np.eye(self._L.shape[0]))

        alphalphaT = np.dot(self._alpha[:, None], self._alpha[None, :])
        tmp = alphalphaT - Kinv
        # Pass tmp to __kernel and let it compute its gradient terms.
        # This scales up to huge number of hyperparameters:
        grad_LML_hypers = self.__kernel.compute_lml_gradient(
            tmp, self._train_fv)
        grad_K_sigma_n = 2.0 * self.params.sigma_noise * np.eye(tmp.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_sigma_n = 0.5 * np.trace(np.dot(tmp,grad_K_sigma_n))
        # Faster formula: tr(AB) = (A*B.T).sum()
        grad_LML_sigma_n = 0.5 * (tmp * (grad_K_sigma_n).T).sum()
        lml_gradient = np.hstack([grad_LML_sigma_n, grad_LML_hypers])
        self.ca.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient

    def compute_gradient_log_marginal_likelihood_logscale(self):
        """Compute gradient of the log marginal likelihood when
        hyperparameters are in logscale. This version use a more
        compact formula provided by Williams and Rasmussen book.
        """
        # Kinv = np.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, np.eye(self._L.shape[0]))
        alphalphaT = np.dot(self._alpha[:, None], self._alpha[None, :])
        tmp = alphalphaT - Kinv
        grad_LML_log_hypers = \
            self.__kernel.compute_lml_gradient_logscale(tmp, self._train_fv)
        grad_K_log_sigma_n = 2.0 * self.params.sigma_noise**2 * np.eye(
            Kinv.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_log_sigma_n = 0.5 * np.trace(np.dot(tmp, grad_K_log_sigma_n))
        # Faster formula: tr(AB) = (A * B.T).sum()
        grad_LML_log_sigma_n = 0.5 * (tmp * (grad_K_log_sigma_n).T).sum()
        lml_gradient = np.hstack([grad_LML_log_sigma_n, grad_LML_log_hypers])
        self.ca.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient

    ##REF: Name was automagically refactored
    def get_sensitivity_analyzer(self, flavor='auto', **kwargs):
        """Returns a sensitivity analyzer for GPR.

        Parameters
        ----------
        flavor : str
          What sensitivity to provide. Valid values are
          'linear', 'model_select', 'auto'.
          In case of 'auto' it selects 'linear' for a linear kernel
          and 'model_select' for the rest. 'linear' corresponds to
          GPRLinearWeights and 'model_select' to GPRWeights.
        """
        # XXX The following two lines do not work since
        # self.__kernel is an instance of LinearKernel and not
        # just LinearKernel. How to fix?
        # YYY yoh is not sure what the problem is... LinearKernel is actually
        #     kernel.LinearKernel so everything should be ok
        if flavor == 'auto':
            flavor = ('model_select', 'linear')\
                     [int(isinstance(self.__kernel, GeneralizedLinearKernel)
                          or
                          isinstance(self.__kernel, LinearKernel))]
            if __debug__:
                debug("GPR", "Returning '%s' sensitivity analyzer" % flavor)

        # Return proper sensitivity
        if flavor == 'linear':
            return GPRLinearWeights(self, **kwargs)
        elif flavor == 'model_select':
            # sanity check
            if not ('has_sensitivity' in self.__tags__):
                raise ValueError, \
                      "model_select flavor is not available, probably " \
                      "because the 'openopt' module is not available"
            return GPRWeights(self, **kwargs)
        else:
            raise ValueError, "Flavor %s is not recognized" % flavor

    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """

        # local bindings for faster lookup
        params = self.params
        retrainable = params.retrainable
        if retrainable:
            newkernel = False
            newL = False
            _changedData = self._changedData

        self._train_fv = train_fv = data.samples
        # GPR relies on numerical labels
        # yoh: yeah -- GPR is now purely regression so no conversion
        #      is necessary
        train_labels = data.sa[params.targets_attr].value
        self._train_labels = train_labels

        if not retrainable or _changedData['traindata'] \
               or _changedData.get('kernel_params', False):
            if __debug__:
                debug("GPR", "Computing train train kernel matrix")
            self.__kernel.compute(train_fv)
            self._km_train_train = km_train_train = asarray(self.__kernel)
            newkernel = True
            if retrainable:
                self._km_train_test = None  # reset to facilitate recomputation
        else:
            if __debug__:
                debug(
                    "GPR", "Not recomputing kernel since retrainable and "
                    "nothing has changed")
            km_train_train = self._km_train_train  # reuse

        if not retrainable or newkernel or _changedData['params']:
            if __debug__:
                debug("GPR", "Computing L. sigma_noise=%g" \
                             % params.sigma_noise)
            # XXX it seems that we do not need binding to object, but maybe
            # the commented out code would return?
            self._C = km_train_train + \
                  params.sigma_noise ** 2 * \
                  np.identity(km_train_train.shape[0], 'd')
            # The following decomposition could raise
            # np.linalg.linalg.LinAlgError because of numerical
            # reasons, due to the too rapid decay of 'self._C'
            # eigenvalues. In that case we try adding a small constant
            # to self._C, e.g. epsilon=1.0e-20. It should be a form of
            # Tikhonov regularization. This is equivalent to adding
            # little white gaussian noise to data.
            #
            # XXX EO: how to choose epsilon?
            #
            # Cholesky decomposition is provided by three different
            # NumPy/SciPy routines (fastest first):
            # 1) self._LL = scipy.linalg.cho_factor(self._C, lower=True)
            #    self._L = L = np.tril(self._LL[0])
            # 2) self._L = scipy.linalg.cholesky(self._C, lower=True)
            # 3) self._L = numpy.linalg.cholesky(self._C)
            # Even though 1 is the fastest we choose 2 since 1 does
            # not return a clean lower-triangular matrix (see docstring).

            # PBS: I just made it so the KernelMatrix is regularized
            # all the time.  I figured that if ever you were going to
            # use regularization, you would want to set it yourself
            # and use the same value for all folds of your data.
            # YOH: Ideally so, but in real "use cases" some might have no
            #      clue, also our unittests (actually clfs_examples) might
            #      fail without any good reason.  So lets return a magic with
            #      an option to forbid any regularization (if lm is None)
            try:
                # apply regularization
                lm, C = params.lm, self._C
                if lm is not None:
                    epsilon = lm * np.eye(C.shape[0])
                    self._L = SLcholesky(C + epsilon, lower=True)
                else:
                    # do 10 attempts to raise each time by 10
                    self._L = _SLcholesky_autoreg(C, nsteps=None, lower=True)
                self._LL = (self._L, True)
            except SLAError:
                raise SLAError("Kernel matrix is not positive definite. "
                               "Try increasing the lm parameter.")
            newL = True
        else:
            if __debug__:
                debug(
                    "GPR", "Not computing L since kernel, data and params "
                    "stayed the same")

        # XXX we leave _alpha being recomputed, although we could check
        #   if newL or _changedData['targets']
        #
        if __debug__:
            debug("GPR", "Computing alpha")
        # L = self._L                 # reuse
        # self._alpha = NLAsolve(L.transpose(),
        #                              NLAsolve(L, train_labels))
        # Faster:
        self._alpha = SLcho_solve(self._LL, train_labels)

        # compute only if the state is enabled
        if self.ca.is_enabled('log_marginal_likelihood'):
            self.compute_log_marginal_likelihood()
            pass

        if retrainable:
            # we must assign it only if it is retrainable
            self.ca.retrained = not newkernel or not newL

        if __debug__:
            debug("GPR", "Done training")

        pass

    @accepts_dataset_as_samples
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        retrainable = self.params.retrainable
        ca = self.ca

        if not retrainable or self._changedData['testdata'] \
               or self._km_train_test is None:
            if __debug__:
                debug('GPR', "Computing train test kernel matrix")
            self.__kernel.compute(self._train_fv, data)
            km_train_test = asarray(self.__kernel)
            if retrainable:
                self._km_train_test = km_train_test
                ca.repredicted = False
        else:
            if __debug__:
                debug('GPR', "Not recomputing train test kernel matrix")
            km_train_test = self._km_train_test
            ca.repredicted = True

        predictions = Ndot(km_train_test.transpose(), self._alpha)

        if ca.is_enabled('predicted_variances'):
            # do computation only if conditional attribute was enabled
            if not retrainable or self._km_test_test is None \
                   or self._changedData['testdata']:
                if __debug__:
                    debug('GPR', "Computing test test kernel matrix")
                self.__kernel.compute(data)
                km_test_test = asarray(self.__kernel)
                if retrainable:
                    self._km_test_test = km_test_test
            else:
                if __debug__:
                    debug('GPR', "Not recomputing test test kernel matrix")
                km_test_test = self._km_test_test

            if __debug__:
                debug("GPR", "Computing predicted variances")
            L = self._L
            # v = NLAsolve(L, km_train_test)
            # Faster:
            piv = np.arange(L.shape[0])
            v = SL.lu_solve((L.T, piv), km_train_test, trans=1)
            # self.predicted_variances = \
            #     Ndiag(km_test_test - Ndot(v.T, v)) \
            #     + self.sigma_noise**2
            # Faster formula: np.diag(Ndot(v.T, v)) = (v**2).sum(0):
            ca.predicted_variances = Ndiag(km_test_test) - (v ** 2).sum(0) \
                                       + self.params.sigma_noise ** 2
            pass

        if __debug__:
            debug("GPR", "Done predicting")
        ca.estimates = predictions
        return predictions

    ##REF: Name was automagically refactored
    def _set_retrainable(self, value, force=False):
        """Internal function : need to set _km_test_test
        """
        super(GPR, self)._set_retrainable(value, force)
        if force or (value and value != self.params.retrainable):
            self._km_test_test = None

    def untrain(self):
        super(GPR, self).untrain()
        # XXX might need to take special care for retrainable. later
        self._init_internals()
        pass

    def set_hyperparameters(self, hyperparameter):
        """
        Set hyperparameters' values.

        Note that 'hyperparameter' is a sequence, so the order of its
        values is important. The first value must be sigma_noise; the
        kernel's remaining hyperparameter values follow in exactly the
        order the kernel expects them.
        """
        if hyperparameter[0] < self.params['sigma_noise'].min:
            raise InvalidHyperparameterError()
        self.params.sigma_noise = hyperparameter[0]
        if hyperparameter.size > 1:
            self.__kernel.set_hyperparameters(hyperparameter[1:])
            pass
        return

    kernel = property(fget=lambda self: self.__kernel)
    pass
Example #29
class GLM(FeaturewiseDatasetMeasure):
    """General linear model (GLM).

    Regressors can be defined in a design matrix and a linear fit of the data
    is computed univariately (i.e. independently for each feature). This
    measure can report 'raw' parameter estimates (i.e. beta weights) of the
    linear model, as well as standardized parameters (z-stat), using an
    ordinary least squares (aka fixed-effects) approach to estimate the
    parameters.

    The measure is reported as a (nregressors x nfeatures)-shaped dataset.
    """

    pe = ConditionalAttribute(enabled=False,
        doc="Parameter estimates (nfeatures x nparameters).")

    zstat = ConditionalAttribute(enabled=False,
        doc="Standardized parameter estimates (nfeatures x nparameters).")

    def __init__(self, design, voi='pe', **kwargs):
        """
        Parameters
        ----------
        design : array (nsamples x nregressors)
          GLM design matrix.
        voi : {'pe', 'zstat'}
          Variable of interest that should be reported as the feature-wise
          measure. 'pe' returns the raw parameter estimates and 'zstat'
          returns standardized parameter estimates.
        """
        FeaturewiseDatasetMeasure.__init__(self, **kwargs)
        # store the design matrix as-is (no copying if already an array)
        self._design = np.asmatrix(design)

        # what should be computed ('variable of interest')
        if voi not in ['pe', 'zstat']:
            raise ValueError("Unknown variable of interest '%s'" % str(voi))
        self._voi = voi

        # will store the precomputed Moore-Penrose pseudo-inverse of the
        # design matrix (lazy calculation)
        self._inv_design = None
        # also store the inverse of the inner product for beta variance
        # estimation
        self._inv_ip = None


    def _call(self, dataset):
        # just for the beauty of it
        X = self._design

        # precompute the transformation if it has not been done yet
        if self._inv_design is None:
            self._inv_ip = (X.T * X).I
            self._inv_design = self._inv_ip * X.T

        # get parameter estimations for all features at once
        # (betas x features)
        betas = self._inv_design * dataset.samples

        # charge state
        self.ca.pe = pe = betas.T.A

        # compute z-stats only if they are requested, either as the variable
        # of interest or via the enabled conditional attribute
        if self._voi != 'pe' or self.ca.is_enabled('zstat'):
            # compute residuals
            residuals = X * betas
            residuals -= dataset.samples

            # estimate the parameter variances and compute z-stats,
            # assuming mean(E) == 0 and equal variance
            # XXX the next lines ignore off-diagonal elements and hence the
            # covariance between regressors. The humble being writing these
            # lines asks the god of statistics for forgiveness, for it knows
            # not what it does.
            diag_ip = np.diag(self._inv_ip)
            # (features x betas)
            beta_vars = np.array([ r.var() * diag_ip for r in residuals.T ])
            # (feature x parameter)
            zstat = pe / np.sqrt(beta_vars)

            # charge state
            self.ca.zstat = zstat

        if self._voi == 'pe':
            # return as (beta x feature)
            result = Dataset(pe.T)
        elif self._voi == 'zstat':
            # return as (zstat x feature)
            result = Dataset(zstat.T)
        else:
            # we shall never get to this point
            raise ValueError("Unknown variable of interest '%s'"
                             % str(self._voi))
        result.sa['regressor'] = np.arange(len(result))
        return result
Example #30
class SensitivityBasedFeatureSelection(FeatureSelection):
    """Feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a certain
    dataset. These sensitivity maps are in turn used to discard unimportant
    features.
    """

    sensitivity = ConditionalAttribute(enabled=False)

    def __init__(self,
                 sensitivity_analyzer,
                 feature_selector=FractionTailSelector(0.05),
                 **kwargs):
        """Initialize feature selection

        Parameters
        ----------
        sensitivity_analyzer : FeaturewiseDatasetMeasure
          sensitivity analyzer to come up with sensitivity
        feature_selector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.

        """

        # base init first
        FeatureSelection.__init__(self, **kwargs)

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer to use once"""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

    def untrain(self):
        if __debug__:
            debug("FS_", "Untraining sensitivity-based FS: %s" % self)
        self.__sensitivity_analyzer.untrain()

    def _call(self, dataset, testdataset=None):
        """Select the most important features

        Parameters
        ----------
        dataset : Dataset
          used to compute sensitivity maps
        testdataset : Dataset
          optional dataset the same feature selection is applied to

        Returns a tuple of two new datasets, each restricted to the selected
        feature subset (the second element is None if no `testdataset` was
        given).
        """

        sensitivity = self.__sensitivity_analyzer(dataset)
        """Compute the sensitivity map."""

        self.ca.sensitivity = sensitivity

        # Select features to preserve
        selected_ids = self.__feature_selector(sensitivity)

        if __debug__:
            debug(
                "FS_", "Sensitivity: %s Selected ids: %s" %
                (sensitivity, selected_ids))

        # Create a dataset only with selected features
        wdataset = dataset[:, selected_ids]

        if testdataset is not None:
            wtestdataset = testdataset[:, selected_ids]
        else:
            wtestdataset = None

        # Note: the return order differs from the one used in RFE
        results = (wdataset, wtestdataset)

        # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
        selected_ids.sort()
        self.ca.selected_ids = selected_ids

        # dataset with selected features is returned
        return results

    # make it accessible from outside
    sensitivity_analyzer = property(
        fget=lambda self: self.__sensitivity_analyzer,
        doc="Measure which was used to do selection")
Example #31
class IFS(FeatureSelection):
    """Incremental feature search.

    A scalar `DatasetMeasure` is computed multiple times on variations of a
    certain dataset. These measures are in turn used to incrementally select
    important features. Starting with an empty feature set the dataset measure
    is first computed for each single feature. A number of features is selected
    based on the resulting data measure map (using an `ElementSelector`).

    Next the dataset measure is computed again using each feature in addition
    to the already selected feature set. Again the `ElementSelector` is used to
    select more features.

    For each feature selection the transfer error on some test dataset is
    computed. This procedure is repeated until a given `StoppingCriterion`
    is reached.
    """

    errors = ConditionalAttribute()

    def __init__(self,
                 data_measure,
                 transfer_error,
                 bestdetector=BestDetector(),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector()),
                 feature_selector=FixedNElementTailSelector(1,
                                                            tail='upper',
                                                            mode='select'),
                 **kwargs):
        """Initialize incremental feature search

        Parameters
        ----------
        data_measure : DatasetMeasure
          Computed for each candidate feature selection. The measure has
          to compute a scalar value.
        transfer_error : TransferError
          Computed against a test dataset for each incremental feature
          set.
        bestdetector : Functor
          Given a list of error values it has to return a boolean that
          signals whether the latest error value is the total minimum.
        stopping_criterion : Functor
          Given a list of error values it has to return whether the
          criterion is fulfilled.
        """
        # bases init first
        FeatureSelection.__init__(self, **kwargs)

        self.__data_measure = data_measure
        self.__transfer_error = transfer_error
        self.__feature_selector = feature_selector
        self.__bestdetector = bestdetector
        self.__stopping_criterion = stopping_criterion

    def _call(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        Parameters
        ----------
        dataset : Dataset
          used to select features and train classifiers to determine the
          transfer error.
        testdataset : Dataset
          used to test the trained classifier on a certain feature set
          to determine the transfer error.

        Returns
        -------
        A tuple with the dataset containing the feature subset of
        `dataset` that had the lowest transfer error of all tested sets until
        the stopping criterion was reached. The tuple also contains a dataset
        with the corresponding features from the `testdataset`.
        """
        errors = []
        """Computed error for each tested features set."""

        # feature candidates are all features in the dataset
        candidates = list(range(dataset.nfeatures))

        # initially empty list of selected features
        selected = []

        # results in here please
        results = None

        # as long as there are candidates left
        # the loop will most likely get broken earlier if the stopping
        # criterion is reached
        while len(candidates):
            # measures for all candidates
            measures = []

            # for all possible candidates
            for i, candidate in enumerate(candidates):
                if __debug__:
                    debug('IFSC', "Tested %i" % i, cr=True)

                # take the new candidate and all already selected features
                # select a new temporary feature subset from the dataset
                # XXX assume MappedDataset and issue plain=True ??
                tmp_dataset = \
                        dataset[:, selected + [candidate]]

                # compute data measure on this feature set
                measures.append(self.__data_measure(tmp_dataset))

            measures = [np.asscalar(m) for m in measures]
            # Select promising feature candidates (staging)
            # IDs are only applicable to the current set of feature candidates
            tmp_staging_ids = self.__feature_selector(measures)

            # translate into real candidate ids
            staging_ids = [candidates[i] for i in tmp_staging_ids]

            # mark them as selected and remove from candidates
            selected += staging_ids
            for i in staging_ids:
                candidates.remove(i)

            # compute transfer error for the new set
            # XXX assume MappedDataset and issue plain=True ??
            error = self.__transfer_error(testdataset[:, selected],
                                          dataset[:, selected])
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            if __debug__:
                debug('IFSC',
                      "nselected %i; error: %.4f " \
                      "best/stop=%d/%d\n" \
                      % (len(selected), errors[-1], isthebest, stop),
                      cr=True, lf=True)

            if isthebest:
                # do copy to survive later selections
                results = copy(selected)

            # leave the loop when the criterion is reached
            if stop:
                break

        # charge state
        self.ca.errors = errors

        # best dataset ever is returned
        return dataset[:, results], testdataset[:, results]