Example #1
    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """
        targets = data.sa[self.get_space()].value[:, np.newaxis]
        enet_kwargs = {}
        if self.__max_steps is not None:
            enet_kwargs['max.steps'] = self.__max_steps

        try:
            self.__trained_model = trained_model = \
                r.enet(data.samples,
                       targets,
                       self.__lm,
                       normalize=self.__normalize,
                       intercept=self.__intercept,
                       trace=self.__trace,
                       **enet_kwargs)
        except RRuntimeError as e:
            raise FailedToTrainError("Failed to predict on %s using %s. Exceptions was: %s" \
                  % (data, self, e))

        # find the step with the lowest Cp (risk); it is often the last
        # step if max_steps was set
        # (disabled -- would need to convert the Cp dictionary to an array)
        #Cp_vals = np.asarray([trained_model['Cp'][str(x)]
        #                      for x in range(len(trained_model['Cp']))])
        #self.__lowest_Cp_step = Cp_vals.argmin()

        # set the weights to the last step
        beta_pure = np.asanyarray(Rrx2(trained_model, 'beta.pure'))
        self.__beta_pure_shape = beta_pure.shape
        self.__weights = np.zeros(data.nfeatures, dtype=beta_pure.dtype)
        ind = np.asanyarray(Rrx2(trained_model, 'allset')) - 1
        self.__weights[ind] = beta_pure[-1, :]
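
For reference, a minimal standalone sketch (made-up numbers, not part of the wrapped R model) of the scatter at the end of _train: R's enet reports coefficients only for the features it ever selected ('allset', 1-based), so they must be scattered back into a dense weight vector:

import numpy as np

nfeatures = 10
beta_pure = np.array([[0.5, -1.2, 0.3]])  # coefficients of the selected features
allset = np.array([2, 5, 9])              # 1-based feature indices from R

weights = np.zeros(nfeatures, dtype=beta_pure.dtype)
weights[allset - 1] = beta_pure[-1, :]    # 0-based scatter of the last step
print(weights)  # [ 0.   0.5  0.   0.  -1.2  0.   0.   0.   0.3  0. ]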
Example #2
    def _train(self, dataset):
        """Train the classifier using `data` (`Dataset`).
        """
        # process targets based on the model family
        targets = dataset.sa[self.get_space()].value
        if self.params.family == 'gaussian':
            # regression: targets are used as-is; no unique targets to track
            self._utargets = None
        elif self.params.family == 'multinomial':
            # turn labels into a one-of-M indicator matrix
            targets_unique = dataset.sa[self.get_space()].unique
            targets = _label2oneofm(targets, targets_unique)

            # save some properties of the data/classification
            self._utargets = targets_unique.copy()

        # process the pmax
        if self.params.pmax is None:
            # set it to the num features
            pmax = dataset.nfeatures
        else:
            # use the value
            pmax = self.params.pmax

        try:
            self.__trained_model = trained_model = \
                r.glmnet(dataset.samples,
                         targets,
                         family=self.params.family,
                         alpha=self.params.alpha,
                         nlambda=self.params.nlambda,
                         standardize=self.params.standardize,
                         thresh=self.params.thresh,
                         pmax=pmax,
                         maxit=self.params.maxit,
                         type=self.params.model_type)
        except RRuntimeError as e:
            raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call r.glmnet()." \
                  % (self, dataset, e))

        self.__last_lambda = last_lambda = \
                             np.asanyarray(Rrx2(trained_model, 'lambda'))[-1]

        # set the weights to the last step
        weights = r.coef(trained_model, s=last_lambda)
        if self.params.family == 'multinomial':
            self.__weights = np.hstack([np.array(r['as.matrix'](weight))[1:]
                                        for weight in weights])
        elif self.params.family == 'gaussian':
            self.__weights = np.array(r['as.matrix'](weights))[1:, 0]
        else:
            raise NotImplementedError("Somehow managed to get here with family %s." % \
                  (self.params.family,))
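
A minimal sketch of the one-of-M encoding that _label2oneofm presumably performs for the multinomial family (the helper below is hypothetical, written here only for illustration):

import numpy as np

def label2oneofm(labels, unique_labels):
    """Return an (nsamples, nclasses) 0/1 indicator matrix."""
    labels = np.asarray(labels)
    out = np.zeros((len(labels), len(unique_labels)), dtype=float)
    for col, ul in enumerate(unique_labels):
        out[labels == ul, col] = 1.0
    return out

print(label2oneofm(['a', 'b', 'a', 'c'], ['a', 'b', 'c']))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]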
Example #3
    def _train(self, dataset):
        """Train the skl learner using `dataset` (`Dataset`).
        """
        targets_sa = dataset.sa[self.get_space()]
        targets = targets_sa.value
        if 'regression' not in self.__tags__:
            targets = self._attrmap.to_numeric(targets)

        try:
            self._R_model = r[self._learner](
                dataset.samples,
                targets,
                **self._kwargs)
        except RRuntimeError as e:
            raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call to fit()." \
                  % (self, dataset, e))
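
The notable pattern here is looking an R function up by name and calling it directly. A minimal sketch under the assumption of rpy2 with its numpy converter active, using R's built-in lm.fit as a stand-in learner:

import numpy as np
from rpy2.robjects import r, numpy2ri

numpy2ri.activate()  # let numpy arrays cross into R transparently

X = np.random.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])
fit = r['lm.fit'](X, y)                     # look up the learner by name, then call
print(np.asarray(fit.rx2('coefficients')))  # -> [ 1. -2.  0.5]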
Example #4
    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """
        targets = data.sa[self.get_space()].value[:, np.newaxis]
        # some non-Python friendly R-lars arguments
        lars_kwargs = {'use.Gram': self.__use_Gram}
        if self.__max_steps is not None:
            lars_kwargs['max.steps'] = self.__max_steps

        trained_model = r.lars(data.samples,
                               targets,
                               type=self.__type,
                               normalize=self.__normalize,
                               intercept=self.__intercept,
                               trace=self.__trace,
                               **lars_kwargs)
        # find the step with the lowest Cp (risk)
        # it is often the last step if you set a max_steps
        # must first convert dictionary to array
        Cp_vals = None
        try:
            Cp_vals = np.asanyarray(Rrx2(trained_model, 'Cp'))
        except TypeError as e:
            raise FailedToTrainError("Failed to train %s on %s. Got '%s' while trying to access " \
                  "trained model %s" % (self, data, e, trained_model))

        if Cp_vals is None:
            # if there were none -- just choose the 0th step
            lowest_Cp_step = 0
        elif np.isnan(Cp_vals[0]):
            # sometimes may come back nan, so just pick the last one
            lowest_Cp_step = len(Cp_vals) - 1
        else:
            # determine the lowest
            lowest_Cp_step = Cp_vals.argmin()

        self.__lowest_Cp_step = lowest_Cp_step
        # set the weights to the lowest Cp step
        self.__weights = np.asanyarray(Rrx2(trained_model,
                                            'beta'))[lowest_Cp_step]

        self.__trained_model = trained_model  # bind to an instance
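
The Cp-selection fallbacks above are easy to lift into a standalone helper; a sketch (the function name is made up) with the same three branches:

import numpy as np

def lowest_cp_step(cp_vals):
    """Step with the lowest Cp; 0 if none available, last if Cp is NaN."""
    if cp_vals is None or len(cp_vals) == 0:
        return 0
    cp_vals = np.asanyarray(cp_vals, dtype=float)
    if np.isnan(cp_vals[0]):
        return len(cp_vals) - 1
    return int(cp_vals.argmin())

print(lowest_cp_step([4.2, 1.1, 3.7]))   # 1 -- the minimum
print(lowest_cp_step([np.nan, np.nan]))  # 1 -- last step when Cp is NaN
print(lowest_cp_step(None))              # 0 -- nothing to choose from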
Example #5
    def _train(self, dataset):
        """Train the skl learner using `dataset` (`Dataset`).
        """
        targets_sa = dataset.sa[self.get_space()]
        targets = targets_sa.value
        # Some sanity checking so that classifiers such as LDA do not
        # raise cryptic exceptions
        if 'lda' in self.__tags__:
            if not dataset.nsamples > len(targets_sa.unique):
                raise DegenerateInputError(
                    "LDA requires # of samples exceeding # of classes")

        # map into numeric labels unless it is a regression
        if 'regression' not in self.__tags__:
            targets = self._attrmap.to_numeric(targets)

        try:
            # train underlying learner
            self._skl_learner.fit(dataset.samples, targets)
        except (ValueError, np.linalg.LinAlgError) as e:
            raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call to fit()." \
                  % (self, dataset, e))
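
The degenerate-input guard is worth a standalone illustration; a sketch using sklearn's LDA as a stand-in for the wrapped learner (the data below are made up):

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X = np.array([[0.0, 1.0], [1.0, 0.0]])  # 2 samples ...
y = np.array([0, 1])                    # ... but also 2 classes

if not len(X) > len(np.unique(y)):
    raise ValueError("LDA requires # of samples exceeding # of classes")
LinearDiscriminantAnalysis().fit(X, y)  # not reached with this input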
Example #6
    def _train(self, dataset):
        """Train SVM
        """
        super(SVM, self)._train(dataset)
        targets_sa_name = self.get_space()  # name of targets sa
        targets_sa = dataset.sa[targets_sa_name]  # actual targets sa

        # libsvm needs doubles
        src = _data2ls(dataset)

        # libsvm cannot handle literal labels
        labels = self._attrmap.to_numeric(targets_sa.value).tolist()

        svmprob = _svm.SVMProblem(labels, src)

        # Translate a few parameters
        TRANSLATEDICT = {'epsilon': 'eps', 'tube_epsilon': 'p'}
        args = []
        for paramname, param in list(self.params.items()) \
                + list(self.kernel_params.items()):
            if paramname in TRANSLATEDICT:
                argname = TRANSLATEDICT[paramname]
            elif paramname in _svm.SVMParameter.default_parameters:
                argname = paramname
            else:
                if __debug__:
                    debug(
                        "SVM_", "Skipping parameter %s since it is not known "
                        "to libsvm" % paramname)
                continue
            args.append((argname, param.value))

        # ??? All those parameters should be fetched if present from
        # **kwargs and create appropriate parameters within .params or
        # .kernel_params
        libsvm_param = _svm.SVMParameter(
            kernel_type=self.params.kernel.as_raw_ls(),  # Just an integer ID
            svm_type=self._svm_type,
            **dict(args))
        """Store SVM parameters in libSVM compatible format."""

        if 'C' in self.params:  # svm_type in [_svm.svmc.C_SVC]:
            Cs = self._get_cvec(dataset)
            if len(Cs) > 1:
                C0 = abs(Cs[0])
                scale = 1.0 / C0
                # we expect one C per label
                uls = self._attrmap.to_numeric(targets_sa.unique)
                if len(Cs) != len(uls):
                    raise ValueError(
                        "SVM was parameterized with %d Cs but there are %d "
                        "labels in the dataset" %
                        (len(Cs), len(targets_sa.unique)))
                weight = [c * scale for c in Cs]
                # All 3 need to be set to take effect
                libsvm_param._set_parameter('weight', weight)
                libsvm_param._set_parameter('nr_weight', len(weight))
                libsvm_param._set_parameter('weight_label', uls)
            libsvm_param._set_parameter('C', Cs[0])

        try:
            self.__model = _svm.SVMModel(svmprob, libsvm_param)
        except Exception as e:
            raise FailedToTrainError(str(e))
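
The parameter-translation loop is a reusable pattern; a standalone sketch with a made-up parameter set and a stand-in for libsvm's known parameter names:

TRANSLATEDICT = {'epsilon': 'eps', 'tube_epsilon': 'p'}
KNOWN = {'C', 'eps', 'p', 'nu', 'degree'}   # stand-in for libsvm's defaults

params = {'epsilon': 1e-3, 'C': 1.0, 'retrainable': True}
args = []
for name, value in params.items():
    if name in TRANSLATEDICT:
        args.append((TRANSLATEDICT[name], value))  # rename for libsvm
    elif name in KNOWN:
        args.append((name, value))                 # pass through unchanged
    # anything else (e.g. 'retrainable') is simply skipped
print(dict(args))   # {'eps': 0.001, 'C': 1.0}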
Example #7
    def _train(self, dataset):
        """Train SVM
        """
        super(SVM, self)._train(dataset)
        # XXX watchout
        # self.untrain()
        newkernel, newsvm = False, False
        # local bindings for faster lookup
        params = self.params
        retrainable = self.params.retrainable

        targets_sa_name = self.get_space()  # name of targets sa
        targets_sa = dataset.sa[targets_sa_name]  # actual targets sa

        if retrainable:
            _changedData = self._changedData

        # LABELS
        ul = None
        self.__traindataset = dataset

        # We have to map labels since
        #  binary SVMs expect -1/+1
        #  multiclass ones expect labels starting with 0, otherwise they
        #  raise errors when run from IPython
        if __debug__:
            debug("SG_", "Creating labels instance")

        if self.__is_regression__:
            labels_ = np.asarray(targets_sa.value, dtype='double')
        else:
            ul = targets_sa.unique
            # ul.sort()

            if len(ul) == 2:
                # assure that we have -1/+1
                _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
            elif len(ul) < 2:
                raise FailedToTrainError(
                    "We do not have 1-class SVM brought into SG yet")
            else:
                # can't use plain enumerate since we need them swapped
                _labels_dict = dict([(u, i) for i, u in enumerate(ul)])

            # Create SG-customized attrmap to assure -1 / +1 if necessary
            self._attrmap = AttributeMap(_labels_dict, mapnumeric=True)

            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = self._attrmap.to_numeric(targets_sa.value).astype(float)

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')

        # KERNEL

        # XXX cruel fix for now... whole retraining business needs to
        # be rethought
        if retrainable:
            _changedData['kernel_params'] = _changedData.get(
                'kernel_params', False)

        # TODO: big RF to move non-kernel classifiers away
        if 'kernel-based' in self.__tags__ and (not retrainable
                                                or _changedData['traindata'] or
                                                _changedData['kernel_params']):
            # If needed compute or just collect arguments for SVM and for
            # the kernel

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug(
                        "SG",
                        "Re-Creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug(
                        "SG",
                        "Re-Creating kernel since params %s has changed" %
                        _changedData['kernel_params'])

            k = self.params.kernel
            k.compute(dataset)
            self.__kernel = kernel = k.as_raw_sg()

            newkernel = True
            self.kernel_params.reset()  # mark them as not-changed
            #_setdebug(kernel, 'Kernels')

            #self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None

        # TODO -- handle _changedData['params'] correctly, ie without recreating
        # whole SVM
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:
            # SVM
            if 'C' in self.params:
                Cs = self._get_cvec(dataset)

                # XXX do not jump over the head and leave it up to the user
                #     ie do not rescale automagically by the number of samples
                #if len(Cs) == 2 and not ('regression' in self.__tags__) and len(ul) == 2:
                #    # we were given two Cs
                #    if np.max(C) < 0 and np.min(C) < 0:
                #        # and both are requested to be 'scaled' TODO :
                #        # provide proper 'features' to the parameters,
                #        # so we could specify explicitly whether to scale
                #        # them by the number of samples here
                #        nl = [np.sum(labels_ == _labels_dict[l]) for l in ul]
                #        ratio = np.sqrt(float(nl[1]) / nl[0])
                #        #ratio = (float(nl[1]) / nl[0])
                #        Cs[0] *= ratio
                #        Cs[1] /= ratio
                #        if __debug__:
                #            debug("SG_", "Rescaled Cs to %s to accomodate the "
                #                  "difference in number of training samples" %
                #                  Cs)

            # Choose appropriate implementation
            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG",
                      "Creating SVM instance of %s" % repr(svm_impl_class))

            if self._svm_impl in ['libsvr', 'svrlight']:
                # for regressions the constructor is a bit different
                self.__svm = svm_impl_class(Cs[0], self.params.tube_epsilon,
                                            self.__kernel, labels)
                # we need to set epsilon explicitly
                self.__svm.set_epsilon(self.params.epsilon)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel,
                                            labels)
            elif 'kernel-based' in self.__tags__:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            else:
                traindata_sg = _tosg(dataset.samples)
                self.__svm = svm_impl_class(Cs[0], traindata_sg, labels)
                self.__svm.set_epsilon(self.params.epsilon)

            # To stay compatible across the API changes in shogun 1.0.0
            if externals.versions['shogun'] >= '1':
                self.__svm_apply = self.__svm.apply
            else:
                self.__svm_apply = self.__svm.classify  # old API

            # Set shrinking
            if 'shrinking' in params:
                shrinking = params.shrinking
                if __debug__:
                    debug("SG_", "Setting shrinking to %s" % shrinking)
                self.__svm.set_shrinking_enabled(shrinking)

            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug(
                        "SG_",
                        "Since multiple Cs are provided: %s, assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()  # mark them as not-changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')
            # Set optimization parameters
            if 'tube_epsilon' in self.params and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['targets']:  # labels were changed
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:  # kernel was replaced
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert _changedData['params'] is False  # we should never get here

        if retrainable:
            # we must assign it only if it is retrainable
            self.ca.retrained = not newsvm or not newkernel

        # Train
        if __debug__ and 'SG' in debug.active:
            if not self.__is_regression__:
                lstr = " with labels %s" % targets_sa.unique
            else:
                lstr = ""
            debug(
                "SG", "%sTraining %s on data%s" %
                (("", "Re-")[retrainable and self.ca.retrained], self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self)

        # Report on training
        if (__debug__ and 'SG__' in debug.active) or \
           self.ca.is_enabled('training_stats'):
            if __debug__:
                debug("SG_", "Assessing predictions on training data")
            trained_targets = self.__svm_apply().get_labels()
        else:
            trained_targets = None

        if __debug__ and "SG__" in debug.active:
            debug(
                "SG__", "Original labels: %s, Trained labels: %s" %
                (targets_sa.value, trained_targets))

        # Assign training confusion right away here since we are ready
        # to do so.
        # XXX TODO use some other conditional attribute like 'trained_targets' and
        #     use it within base Classifier._posttrain to assign predictions
        #     instead of duplicating code here
        # XXX For now it can be done only for regressions since labels need to
        #     be remapped and that becomes even worse if we use regression
        #     as a classifier so mapping happens upstairs
        if self.__is_regression__ and self.ca.is_enabled('training_stats'):
            self.ca.training_stats = self.__summary_class__(
                targets=targets_sa.value, predictions=trained_targets)
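
The binary -1/+1 mapping near the top deserves its own sketch; a hypothetical helper mirroring the _labels_dict construction for the two-class case:

import numpy as np

def binary_label_map(targets):
    """Map two literal labels onto -1/+1 as shogun's binary SVMs expect."""
    ul = np.unique(targets)
    if len(ul) != 2:
        raise ValueError("expected exactly 2 classes, got %d" % len(ul))
    mapping = {str(ul[0]): -1.0, str(ul[1]): +1.0}
    return np.array([mapping[t] for t in targets]), mapping

labels, mapping = binary_label_map(['cat', 'dog', 'cat'])
print(labels, mapping)   # [-1.  1. -1.] {'cat': -1.0, 'dog': 1.0}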
Example #8
    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """
        # Set up the environment for fitting the data
        X = data.samples.T
        d = self._attrmap.to_numeric(data.sa[self.get_space()].value)
        if set(d) != set([0, 1]):
            raise ValueError("Regressors for logistic regression should be [0,1]. Got %s" \
                  %(set(d),))

        if self.__reduced != 0:
            # Data have reduced rank
            from scipy.linalg import svd

            # Compensate for reduced rank:
            # select only the components with the largest singular values
            U, S, V = svd(X.T)
            if S[0] == 0:
                raise FailedToTrainError(
                    "Data provided to PLR seems to be degenerate -- "
                    "0-th singular value is 0")
            S /= S[0]
            V = np.matrix(V[:, :np.max(np.where(S > self.__reduced)) + 1])
            # Map Data to the subspace spanned by the eigenvectors
            X = (X.T * V).T

        nfeatures, npatterns = X.shape

        # Weighting vector
        w = np.matrix(np.zeros((nfeatures + 1, 1), 'd'))
        # Error for convergence criterion
        dw = np.matrix(np.ones((nfeatures + 1, 1), 'd'))
        # Patterns of interest in the columns
        X = np.matrix( \
                np.concatenate((X, np.ones((1, npatterns), 'd')), 0) \
                )
        p = np.matrix(np.zeros((1, npatterns), 'd'))
        # Matrix implementation of penalty term
        Lambda = self.__lm * np.identity(nfeatures + 1, 'd')
        Lambda[nfeatures, nfeatures] = 0
        # Gradient
        g = np.matrix(np.zeros((nfeatures + 1, 1), 'd'))
        # Fisher information matrix
        H = np.matrix(np.identity(nfeatures + 1, 'd'))

        # Optimize
        k = 0
        while np.sum(np.ravel(dw.A**2)) > self.__criterion:
            p[:, :] = self.__f(w.T * X)
            g[:, :] = X * (d - p).T - Lambda * w
            H[:, :] = X * np.diag(p.A1 * (1 - p.A1)) * X.T + Lambda
            dw[:, :] = H.I * g
            w += dw
            k += 1
            if k > self.__maxiter:
                raise ConvergenceError("More than %d Iterations without convergence" % \
                      (self.__maxiter))

        if __debug__:
            debug("PLR", \
                  "PLR converged after %d steps. Error: %g" % \
                  (k, np.sum(np.ravel(dw.A ** 2))))

        if self.__reduced:
            # We have computed in rank reduced space ->
            # Project to original space
            self.w = V * w[:-1]
            self.bias = w[-1]
        else:
            self.w = w[:-1]
            self.bias = w[-1]
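
Since np.matrix is long deprecated, here is a sketch of the same penalized Newton/IRLS update with plain ndarrays (function name and toy data are made up; the update mirrors the loop above):

import numpy as np

def plr_fit(X, d, lm=1.0, criterion=1e-8, maxiter=100):
    """Newton/IRLS fit of penalized logistic regression.
    X: (nsamples, nfeatures), d: 0/1 targets."""
    n, f = X.shape
    Xa = np.concatenate((X, np.ones((n, 1))), axis=1).T  # bias row appended
    w = np.zeros(f + 1)
    Lambda = lm * np.eye(f + 1)
    Lambda[f, f] = 0.0                               # do not penalize the bias
    for _ in range(maxiter):
        p = 1.0 / (1.0 + np.exp(-(w @ Xa)))          # predicted probabilities
        g = Xa @ (d - p) - Lambda @ w                # gradient
        H = (Xa * (p * (1 - p))) @ Xa.T + Lambda     # Fisher information
        dw = np.linalg.solve(H, g)                   # Newton step
        w += dw
        if dw @ dw <= criterion:
            return w[:-1], w[-1]                     # weights, bias
    raise RuntimeError("no convergence after %d iterations" % maxiter)

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 3))
d = (X @ np.array([2.0, -1.0, 0.5]) + 0.3 > 0).astype(float)
w, bias = plr_fit(X, d)
print(w, bias)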