Example #1
    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest.
        The class probability of a single tree is the fraction of samples of
        the same class in a leaf.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=False)
        if hasattr(self, 'n_features_in_'):
            try:
                num_features = _daal_num_features(X)
            except TypeError:
                num_features = _num_samples(X)
            if num_features != self.n_features_in_:
                raise ValueError(
                    (f'X has {num_features} features, '
                     f'but RandomForestClassifier is expecting '
                     f'{self.n_features_in_} features as input'))

        _patching_status = PatchingConditionsChain(
            "sklearn.ensemble.RandomForestClassifier.predict_proba")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (daal_check_version((2021, 'P', 400)),
                "oneDAL version is lower than 2021.4.")])
        if hasattr(self, 'n_outputs_'):
            _dal_ready = _patching_status.and_conditions([
                (self.n_outputs_ == 1,
                    f"Number of outputs ({self.n_outputs_}) is not 1.")])
        _patching_status.write_log()

        if not _dal_ready:
            return super(RandomForestClassifier, self).predict_proba(X)
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self)
        if sklearn_check_version('0.23'):
            self._check_n_features(X, reset=False)
        return _daal_predict_proba(self, X)
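A minimal usage sketch of the method above (synthetic data; the sklearnex patching entry point is assumed): once patching is applied, predict_proba dispatches to oneDAL whenever the conditions in the patching chain hold and otherwise falls back to stock scikit-learn.

import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()  # replace supported estimators with the accelerated versions
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(200, 6).astype(np.float32)
y = np.random.randint(0, 3, size=200)
clf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)
proba = clf.predict_proba(X)  # ndarray of shape (200, 3); oneDAL path when all conditions hold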
Example #2
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_
        if sklearn_check_version('0.22'):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        # convert model to estimators
        params = {
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'random_state': None,
        }
        if not sklearn_check_version('1.0'):
            params['min_impurity_split'] = self.min_impurity_split
        est = DecisionTreeClassifier(**params)

        # we need to set est.tree_ field with Trees constructed from Intel(R)
        # oneAPI Data Analytics Library solution
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_

            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_in_, np.array([1],
                                                             dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        return estimators_
Example #3
    def __init__(
        self,
        alpha=1.0,
        fit_intercept=True,
        normalize="deprecated" if sklearn_check_version('1.0') else False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='cyclic',
    ):
        super().__init__(
            alpha=alpha,
            l1_ratio=1.0,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )
Example #4
def patch_sklearn(name=None, verbose=True, global_patch=False):
    if not sklearn_check_version('0.22'):
        raise NotImplementedError(
            "Intel(R) Extension for Scikit-learn* patches apply "
            "for scikit-learn >= 0.22 only ...")

    if global_patch:
        from sklearnex.glob.dispatcher import patch_sklearn_global
        patch_sklearn_global(name, verbose)

    from daal4py.sklearn import patch_sklearn as patch_sklearn_orig

    if _is_new_patching_available():
        for config in ['set_config', 'get_config', 'config_context']:
            patch_sklearn_orig(config,
                               verbose=False,
                               deprecation=False,
                               get_map=get_patch_map)
    if isinstance(name, list):
        for algorithm in name:
            patch_sklearn_orig(algorithm,
                               verbose=False,
                               deprecation=False,
                               get_map=get_patch_map)
    else:
        patch_sklearn_orig(name,
                           verbose=False,
                           deprecation=False,
                           get_map=get_patch_map)

    if verbose and sys.stderr is not None:
        sys.stderr.write("Intel(R) Extension for Scikit-learn* enabled "
                         "(https://github.com/intel/scikit-learn-intelex)\n")
Example #5
def _fit_classifier(self, X, y, sample_weight=None):
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestClassifier.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion == "gini",
            f"'{self.criterion}' criterion is not supported. "
            "Only 'gini' criterion is supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float32, np.float64])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()
    if _dal_ready:
        _daal_fit_classifier(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_

        # Decapsulate classes_ attributes
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]
        return self
    return super(RandomForestClassifier, self).fit(
        X, y, sample_weight=sample_weight)
Example #6
def _fit_regressor(self, X, y, sample_weight=None):
    if sp.issparse(y):
        raise ValueError("sparse multilabel-indicator for y is not supported.")
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    if (sklearn_check_version('1.0') and self.criterion == "mse"):
        warnings.warn(
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent.", FutureWarning)

    daal_ready = self.warm_start is False and \
        self.criterion in ["mse", "squared_error"] and self.ccp_alpha == 0.0 and \
        not sp.issparse(X) and self.oob_score is False

    if daal_ready:
        _supported_dtypes_ = [np.double, np.single]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        if self.n_outputs_ != 1:
            daal_ready = False

    if daal_ready:
        logging.info("sklearn.ensemble.RandomForestRegressor."
                     "fit: " + get_patch_message("daal"))
        _daal_fit_regressor(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_
        return self
    logging.info("sklearn.ensemble.RandomForestRegressor."
                 "fit: " + get_patch_message("sklearn"))
    return super(RandomForestRegressor, self).fit(X,
                                                  y,
                                                  sample_weight=sample_weight)
Example #7
    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest.
        The class probability of a single tree is the fraction of samples of
        the same class in a leaf.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        if hasattr(self, 'n_features_in_'):
            try:
                num_features = _daal_num_features(X)
            except TypeError:
                num_features = _num_samples(X)
            if num_features != self.n_features_in_:
                raise ValueError((f'X has {num_features} features, '
                                  f'but RandomForestClassifier is expecting '
                                  f'{self.n_features_in_} features as input'))
        if not hasattr(self, 'daal_model_') or \
           sp.issparse(X) or self.n_outputs_ != 1 or \
           not daal_check_version((2021, 'P', 400)):
            logging.info("sklearn.ensemble.RandomForestClassifier."
                         "predict_proba: " + get_patch_message("sklearn"))
            return super(RandomForestClassifier, self).predict_proba(X)
        logging.info("sklearn.ensemble.RandomForestClassifier."
                     "predict_proba: " + get_patch_message("daal"))
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self)
        if sklearn_check_version('0.23'):
            self._check_n_features(X, reset=False)
        return _daal_predict_proba(self, X)
Example #8
def _daal4py_predict_lasso(self, X):
    X = make2d(X)
    _fptype = getFPType(self.coef_)

    lasso_palg = daal4py.lasso_regression_prediction(fptype=_fptype,
                                                     method='defaultDense')
    if sklearn_check_version('0.23'):
        if self.n_features_in_ != X.shape[1]:
            raise ValueError(f'X has {X.shape[1]} features, '
                             f'but Lasso is expecting '
                             f'{self.n_features_in_} features as input')
    lasso_res = lasso_palg.compute(X, self.daal_model_)

    res = lasso_res.prediction

    if res.shape[1] == 1 and self.coef_.ndim == 1:
        res = np.ravel(res)
    return res
Example #9
    def predict(self, X):
        """
        Predict regression target for X.

        The predicted regression target of an input sample is computed as
        the mean predicted regression targets of the trees in the forest.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
            The predicted values.
        """
        _patching_status = PatchingConditionsChain(
            "sklearn.ensemble.RandomForestRegressor.predict")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported.")])
        if hasattr(self, 'n_outputs_'):
            _dal_ready = _patching_status.and_conditions([
                (self.n_outputs_ == 1,
                    f"Number of outputs ({self.n_outputs_}) is not 1.")])

        _patching_status.write_log()
        if not _dal_ready:
            return super(RandomForestRegressor, self).predict(X)

        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=False)
        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        return _daal_predict_regressor(self, X)
Example #10
    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """

        if sklearn_check_version('1.0'):
            self._check_feature_names(X, reset=False)

        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        good_shape_for_daal = \
            True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False

        _patching_status = PatchingConditionsChain(
            "sklearn.linear_model.ElasticNet.predict")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (good_shape_for_daal,
                "The shape of X does not satisfy oneDAL requirements: "
                "number of features > number of samples.")])
        _patching_status.write_log()

        if not _dal_ready:
            return self._decision_function(X)
        return _daal4py_predict_enet(self, X)
Example #11
def _get_n_samples_bootstrap(n_samples, max_samples):
    if max_samples is None:
        return 1.

    if isinstance(max_samples, numbers.Integral):
        if not (1 <= max_samples <= n_samples):
            msg = "`max_samples` must be in range 1 to {} but got value {}"
            raise ValueError(msg.format(n_samples, max_samples))
        return float(max_samples / n_samples)

    if isinstance(max_samples, numbers.Real):
        if sklearn_check_version('1.0'):
            if not (0 < float(max_samples) <= 1):
                msg = "`max_samples` must be in range (0.0, 1.0] but got value {}"
                raise ValueError(msg.format(max_samples))
        else:
            if not (0 < float(max_samples) < 1):
                msg = "`max_samples` must be in range (0, 1) but got value {}"
                raise ValueError(msg.format(max_samples))
        return float(max_samples)

    msg = "`max_samples` should be int or float, but got type '{}'"
    raise TypeError(msg.format(type(max_samples)))
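A few illustrative values for the helper above (a sketch; the assertions follow directly from the branches shown):

assert _get_n_samples_bootstrap(n_samples=100, max_samples=None) == 1.0   # use all samples
assert _get_n_samples_bootstrap(n_samples=100, max_samples=25) == 0.25    # int -> fraction of n_samples
assert _get_n_samples_bootstrap(n_samples=100, max_samples=0.5) == 0.5    # valid float returned as-is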
Example #12
#===============================================================================
# Copyright 2014-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

from daal4py.sklearn._utils import sklearn_check_version

if sklearn_check_version('0.23'):
    from ._svm_0_23 import *
elif sklearn_check_version('0.22'):
    from ._svm_0_22 import *
Example #13
def _daal_fit_regressor(self, X, y, sample_weight=None):
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    # the limit on the number of streams for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                 X.shape[1],
                                                 is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap
        if self.bootstrap is True else 1.,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    self._cached_estimators_ = None

    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
Example #14
class Lasso(ElasticNet):
    __doc__ = Lasso_original.__doc__

    def __init__(
        self,
        alpha=1.0,
        fit_intercept=True,
        normalize="deprecated" if sklearn_check_version('1.0') else False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='cyclic',
    ):
        super().__init__(
            alpha=alpha,
            l1_ratio=1.0,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )

    if sklearn_check_version('0.23'):
        @support_usm_ndarray()
        def fit(self, X, y, sample_weight=None, check_input=True):
            return _fit(self, X, y, sample_weight, check_input)
    else:
        @support_usm_ndarray()
        def fit(self, X, y, check_input=True):
            return _fit(self, X, y, check_input=check_input)

    @support_usm_ndarray()
    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """

        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        good_shape_for_daal = \
            True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False

        if not hasattr(self, 'daal_model_') or \
                sp.issparse(X) or \
                not good_shape_for_daal:
            logging.info(
                "sklearn.linear_model.Lasso."
                "predict: " + get_patch_message("sklearn"))
            return self._decision_function(X)
        logging.info(
            "sklearn.linear_model.Lasso."
            "predict: " + get_patch_message("daal"))
        return _daal4py_predict_lasso(self, X)
Example #15
class ElasticNet(ElasticNet_original):
    __doc__ = ElasticNet_original.__doc__

    def __init__(
        self,
        alpha=1.0,
        l1_ratio=0.5,
        fit_intercept=True,
        normalize="deprecated" if sklearn_check_version('1.0') else False,
        precompute=False,
        max_iter=1000,
        copy_X=True,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='cyclic',
    ):
        super(ElasticNet, self).__init__(
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            max_iter=max_iter,
            copy_X=copy_X,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )

    if sklearn_check_version('0.23'):
        @support_usm_ndarray()
        def fit(self, X, y, sample_weight=None, check_input=True):
            return _fit(self, X, y, sample_weight=sample_weight, check_input=check_input)
    else:
        @support_usm_ndarray()
        def fit(self, X, y, check_input=True):
            return _fit(self, X, y, check_input=check_input)

    @support_usm_ndarray()
    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """

        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        good_shape_for_daal = \
            True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False

        if not hasattr(self, 'daal_model_') or \
                sp.issparse(X) or \
                not good_shape_for_daal:
            logging.info(
                "sklearn.linear_model.ElasticNet."
                "predict: " + get_patch_message("sklearn"))
            return self._decision_function(X)
        logging.info(
            "sklearn.linear_model.ElasticNet."
            "predict: " + get_patch_message("daal"))
        return _daal4py_predict_enet(self, X)

    @property
    def dual_gap_(self):
        if (self._gap is None):
            l1_reg = self.alpha * self.l1_ratio * self._X.shape[0]
            l2_reg = self.alpha * (1.0 - self.l1_ratio) * self._X.shape[0]
            n_targets = self._y.shape[1]

            if (n_targets == 1):
                self._gap = self.tol + 1.0
                X_offset = np.average(self._X, axis=0)
                y_offset = np.average(self._y, axis=0)
                coef = np.reshape(self.coef_, (self.coef_.shape[0], 1))
                R = (self._y - y_offset) - np.dot((self._X - X_offset), coef)
                XtA = np.dot((self._X - X_offset).T, R) - l2_reg * coef
                R_norm2 = np.dot(R.T, R)
                coef_norm2 = np.dot(self.coef_, self.coef_)
                dual_norm_XtA = np.max(
                    XtA) if self.positive else np.max(np.abs(XtA))
                if dual_norm_XtA > l1_reg:
                    const = l1_reg / dual_norm_XtA
                    A_norm2 = R_norm2 * (const ** 2)
                    self._gap = 0.5 * (R_norm2 + A_norm2)
                else:
                    const = 1.0
                    self._gap = R_norm2
                l1_norm = np.sum(np.abs(self.coef_))
                tmp = l1_reg * l1_norm
                tmp -= const * np.dot(R.T, (self._y - y_offset))
                tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2
                self._gap += tmp
                self._gap = self._gap[0][0]
            else:
                self._gap = np.full(n_targets, self.tol + 1.0)
                X_offset = np.average(self._X, axis=0)
                y_offset = np.average(self._y, axis=0)
                for k in range(n_targets):
                    R = (self._y[:, k] - y_offset[k]) - \
                        np.dot((self._X - X_offset), self.coef_[k, :].T)
                    XtA = np.dot((self._X - X_offset).T, R) - \
                        l2_reg * self.coef_[k, :].T
                    R_norm2 = np.dot(R.T, R)
                    coef_norm2 = np.dot(self.coef_[k, :], self.coef_[k, :].T)
                    dual_norm_XtA = np.max(
                        XtA) if self.positive else np.max(np.abs(XtA))
                    if dual_norm_XtA > l1_reg:
                        const = l1_reg / dual_norm_XtA
                        A_norm2 = R_norm2 * (const ** 2)
                        self._gap[k] = 0.5 * (R_norm2 + A_norm2)
                    else:
                        const = 1.0
                        self._gap[k] = R_norm2
                    l1_norm = np.sum(np.abs(self.coef_[k, :]))
                    tmp = l1_reg * l1_norm
                    tmp -= const * np.dot(R.T, (self._y[:, k] - y_offset[k]))
                    tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2
                    self._gap[k] += tmp
        return self._gap

    @dual_gap_.setter
    def dual_gap_(self, value):
        self._gap = value

    @dual_gap_.deleter
    def dual_gap_(self):
        self._gap = None
Example #16
def _fit_regressor(self, X, y, sample_weight=None):
    if sp.issparse(y):
        raise ValueError(
            "sparse multilabel-indicator for y is not supported."
        )
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = check_sample_weight(sample_weight, X)

    if sklearn_check_version('1.0') and self.criterion == "mse":
        warnings.warn(
            "Criterion 'mse' was deprecated in v1.0 and will be "
            "removed in version 1.2. Use `criterion='squared_error'` "
            "which is equivalent.",
            FutureWarning
        )

    _patching_status = PatchingConditionsChain(
        "sklearn.ensemble.RandomForestRegressor.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.oob_score and daal_check_version((2021, 'P', 500)) or not self.oob_score,
            "OOB score is only supported starting from 2021.5 version of oneDAL."),
        (self.warm_start is False, "Warm start is not supported."),
        (self.criterion in ["mse", "squared_error"],
            f"'{self.criterion}' criterion is not supported. "
            "Only 'mse' and 'squared_error' criteria are supported."),
        (self.ccp_alpha == 0.0,
            f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."),
        (not sp.issparse(X), "X is sparse. Sparse input is not supported.")
    ])

    if _dal_ready:
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=True)
        X = check_array(X, dtype=[np.float64, np.float32])
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        _dal_ready = _patching_status.and_conditions([
            (self.n_outputs_ == 1, f"Number of outputs ({self.n_outputs_}) is not 1.")])

    _patching_status.write_log()
    if _dal_ready:
        _daal_fit_regressor(self, X, y, sample_weight=sample_weight)

        self.estimators_ = self._estimators_
        return self
    return super(RandomForestRegressor, self).fit(
        X, y, sample_weight=sample_weight)
Example #17
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit model with coordinate descent.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Data.

        y : {ndarray, sparse matrix} of shape (n_samples,) or \
            (n_samples, n_targets)
            Target. Will be cast to X's dtype if necessary

        sample_weight : float or array-like of shape (n_samples,), default=None
            Sample weight.

        check_input : bool, default=True
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Notes
        -----

        Coordinate descent is an algorithm that considers each column of
        data at a time hence it will automatically convert the X input
        as a Fortran-contiguous numpy array if necessary.

        To avoid memory re-allocation it is advised to allocate the
        initial data in memory directly using that format.
        """
        # check X and y
        if check_input:
            X, y = check_X_y(X,
                             y,
                             copy=False,
                             accept_sparse='csc',
                             dtype=[np.float64, np.float32],
                             multi_output=True,
                             y_numeric=True)
            y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)
        else:
            # only for compliance with Sklearn,
            # this assert is not required for Intel(R) oneAPI Data
            # Analytics Library
            if isinstance(X, np.ndarray) and \
                    X.flags['F_CONTIGUOUS'] is False:
                raise ValueError("ndarray is not Fortran contiguous")

        if isinstance(X, np.ndarray):
            self.fit_shape_good_for_daal_ = \
                True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False
        else:
            self.fit_shape_good_for_daal_ = False

        if sp.issparse(X) or \
                sample_weight is not None or \
                not self.fit_shape_good_for_daal_ or \
                not (X.dtype == np.float64 or X.dtype == np.float32):
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso."
                         "fit: " + get_patch_message("sklearn"))
            res_new = super(ElasticNet, self).fit(X,
                                                  y,
                                                  sample_weight=sample_weight,
                                                  check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new

        if sklearn_check_version('1.0'):
            self.normalize = _deprecate_normalize(
                self.normalize,
                default=False,
                estimator_name=self.__class__.__name__)

        self.n_iter_ = None
        self._gap = None
        # only for pass tests
        # "check_estimators_fit_returns_self(readonly_memmap=True) and
        # check_regressors_train(readonly_memmap=True)
        if not (X.flags.writeable):
            X = np.copy(X)
        if not (y.flags.writeable):
            y = np.copy(y)
        logging.info("sklearn.linear_model.Lasso."
                     "fit: " + get_patch_message("daal"))
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
        if res is None:
            if hasattr(self, 'daal_model_'):
                del self.daal_model_
            logging.info("sklearn.linear_model.Lasso."
                         "fit: " + get_patch_message("sklearn_after_daal"))
            res_new = super(ElasticNet, self).fit(X,
                                                  y,
                                                  sample_weight=sample_weight,
                                                  check_input=check_input)
            self._gap = res_new.dual_gap_
            return res_new
        return res
Example #18
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)

    # the limit on the number of streams for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples
    )

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf
                                   if isinstance(
                                       self.min_samples_leaf, numbers.Integral)
                                   else int(ceil(
                                       self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute=(
            "computeOutOfBagErrorAccuracy|computeOutOfBagErrorDecisionFunction"
            if self.oob_score
            else ""),
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split
                                    if isinstance(
                                        self.min_samples_split, numbers.Integral)
                                    else int(ceil(
                                        self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize
    )
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    if self.oob_score:
        self.oob_score_ = dfc_trainingResult.outOfBagErrorAccuracy[0][0]
        self.oob_decision_function_ = dfc_trainingResult.outOfBagErrorDecisionFunction
        if self.oob_decision_function_.shape[-1] == 1:
            self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)

    return self
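A hedged usage sketch for the out-of-bag path above: with oob_score=True and bootstrap=True, the training result populates the standard scikit-learn attributes (synthetic data; the patched estimator from the extension is assumed to be importable after patching):

import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(300, 8)
y = np.random.randint(0, 2, size=300)
clf = RandomForestClassifier(n_estimators=100, oob_score=True, bootstrap=True).fit(X, y)
print(clf.oob_score_)                    # out-of-bag accuracy
print(clf.oob_decision_function_.shape)  # (n_samples, n_classes)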
Example #19
    def __init__(self,
                 n_estimators=100,
                 *,
                 criterion="mse",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 ccp_alpha=0.0,
                 max_samples=None,
                 maxBins=256,
                 minBinSize=1):
        if sklearn_check_version('0.21'):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                min_impurity_split=min_impurity_split,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start)
            self.ccp_alpha = ccp_alpha
            self.max_samples = max_samples
        else:
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                min_impurity_split=min_impurity_split,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start,
                ccp_alpha=ccp_alpha,
                max_samples=max_samples)
        self.maxBins = maxBins
        self.minBinSize = minBinSize
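The constructor above adds two extension-specific histogram parameters, maxBins and minBinSize, on top of the stock scikit-learn signature. A hedged sketch of setting them (synthetic data; parameter semantics as suggested by the decision_forest training calls in the earlier examples):

import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(500, 10)
y = np.random.rand(500)
reg = RandomForestRegressor(n_estimators=50, maxBins=128, minBinSize=5).fit(X, y)
pred = reg.predict(X[:5])  # histogram split search limited to 128 bins per feature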
Example #20
def _fit(self, X, y, sample_weight=None, check_input=True):
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        self.fit_shape_good_for_daal_ = \
            True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False
    else:
        self.fit_shape_good_for_daal_ = False

    log_str = "sklearn.linear_model." + self.__class__.__name__ + ".fit: "
    sklearn_ready = sp.issparse(X) or not self.fit_shape_good_for_daal_ or \
        X.dtype not in [np.float64, np.float32] or sample_weight is not None

    if sklearn_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with Sklearn,
        # this assert is not required for Intel(R) oneAPI Data
        # Analytics Library
        if isinstance(X, np.ndarray) and \
                X.flags['F_CONTIGUOUS'] is False:
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only for pass tests
    # "check_estimators_fit_returns_self(readonly_memmap=True) and
    # check_regressors_train(readonly_memmap=True)
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)
    logging.info(log_str + get_patch_message("daal"))

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            log_str + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
Example #21
    def _fit(self, X, skip_num_points=0):
        """Private function to fit the model using X as training data."""
        if isinstance(self.init, str) and self.init == 'warn':
            warnings.warn(
                "The default initialization in TSNE will change "
                "from 'random' to 'pca' in 1.2.", FutureWarning)
            self._init = 'random'
        else:
            self._init = self.init

        if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
            raise TypeError("PCA initialization is currently not supported "
                            "with the sparse input matrix. Use "
                            "init=\"random\" instead.")

        if self.method not in ['barnes_hut', 'exact']:
            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
        if self.angle < 0.0 or self.angle > 1.0:
            raise ValueError("'angle' must be between 0.0 - 1.0")
        if self.learning_rate == 'warn':
            warnings.warn(
                "The default learning rate in TSNE will change "
                "from 200.0 to 'auto' in 1.2.", FutureWarning)
            self._learning_rate = 200.0
        else:
            self._learning_rate = self.learning_rate
        if self._learning_rate == 'auto':
            self._learning_rate = X.shape[0] / self.early_exaggeration / 4
            self._learning_rate = np.maximum(self._learning_rate, 50)
        else:
            if not (self._learning_rate > 0):
                raise ValueError("'learning_rate' must be a positive number "
                                 "or 'auto'.")

        if hasattr(self, 'square_distances'):
            if self.square_distances not in [True, 'legacy']:
                raise ValueError(
                    "'square_distances' must be True or 'legacy'.")
            if self.metric != "euclidean" and self.square_distances is not True:
                warnings.warn(
                    ("'square_distances' has been introduced in 0.24 "
                     "to help phase out legacy squaring behavior. The "
                     "'legacy' setting will be removed in 0.26, and the "
                     "default setting will be changed to True. In 0.28, "
                     "'square_distances' will be removed altogether, "
                     "and distances will be squared by default. Set "
                     "'square_distances'=True to silence this warning."),
                    FutureWarning)

        if self.method == 'barnes_hut':
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr'],
                                        ensure_min_samples=2,
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr'],
                                ensure_min_samples=2,
                                dtype=[np.float32, np.float64])
        else:
            if sklearn_check_version('0.23'):
                X = self._validate_data(X,
                                        accept_sparse=['csr', 'csc', 'coo'],
                                        dtype=[np.float32, np.float64])
            else:
                X = check_array(X,
                                accept_sparse=['csr', 'csc', 'coo'],
                                dtype=[np.float32, np.float64])

        if self.metric == "precomputed":
            if isinstance(self._init, str) and self._init == 'pca':
                raise ValueError("The parameter init=\"pca\" cannot be "
                                 "used with metric=\"precomputed\".")
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square distance matrix")

            check_non_negative(
                X, "TSNE.fit(). With metric='precomputed', X "
                "should contain positive distances.")

            if self.method == "exact" and issparse(X):
                raise TypeError(
                    'TSNE with method="exact" does not accept sparse '
                    'precomputed distance matrix. Use method="barnes_hut" '
                    'or provide the dense distance matrix.')

        if self.method == 'barnes_hut' and self.n_components > 3:
            raise ValueError("'n_components' should be inferior to 4 for the "
                             "barnes_hut algorithm as it relies on "
                             "quad-tree or oct-tree.")
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
            raise ValueError(
                "early_exaggeration must be at least 1, but is {}".format(
                    self.early_exaggeration))

        if self.n_iter < 250:
            raise ValueError("n_iter should be at least 250")

        n_samples = X.shape[0]

        neighbors_nn = None
        if self.method == "exact":
            # Retrieve the distance matrix, either using the precomputed one or
            # computing it.
            if self.metric == "precomputed":
                distances = X
            else:
                if self.verbose:
                    print("[t-SNE] Computing pairwise distances...")

                if self.metric == "euclidean":
                    # Euclidean is squared here, rather than using **= 2,
                    # because euclidean_distances already calculates
                    # squared distances, and returns np.sqrt(dist) for
                    # squared=False.
                    # Also, Euclidean is slower for n_jobs>1, so don't set here
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   squared=True)
                else:
                    distances = pairwise_distances(X,
                                                   metric=self.metric,
                                                   n_jobs=self.n_jobs)

            if np.any(distances < 0):
                raise ValueError("All distances should be positive, the "
                                 "metric given is not correct")

            if self.metric != "euclidean" and \
                    getattr(self, 'square_distances', True) is True:
                distances **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities(distances, self.perplexity, self.verbose)
            assert np.all(np.isfinite(P)), "All probabilities should be finite"
            assert np.all(P >= 0), "All probabilities should be non-negative"
            assert np.all(P <= 1), ("All probabilities should be less "
                                    "than or equal to one")

        else:
            # Compute the number of nearest neighbors to find.
            # LvdM uses 3 * perplexity as the number of neighbors.
            # In the event that we have very small # of points
            # set the neighbors to n - 1.
            n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))

            if self.verbose:
                print("[t-SNE] Computing {} nearest neighbors...".format(
                    n_neighbors))

            # Find the nearest neighbors for every point
            knn = NearestNeighbors(
                algorithm='auto',
                n_jobs=self.n_jobs,
                n_neighbors=n_neighbors,
                metric=self.metric,
            )
            t0 = time()
            knn.fit(X)
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
                    n_samples, duration))

            t0 = time()
            distances_nn = knn.kneighbors_graph(mode='distance')
            duration = time() - t0
            if self.verbose:
                print("[t-SNE] Computed neighbors for {} samples "
                      "in {:.3f}s...".format(n_samples, duration))

            # Free the memory used by the ball_tree
            del knn

            if getattr(self, 'square_distances', True) is True or \
                    self.metric == "euclidean":
                # knn return the euclidean distance but we need it squared
                # to be consistent with the 'exact' method. Note that the
                # method was derived using the euclidean method as in the
                # input space. Not sure of the implication of using a different
                # metric.
                distances_nn.data **= 2

            # compute the joint probability distribution for the input space
            P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                        self.verbose)

        if isinstance(self._init, np.ndarray):
            X_embedded = self._init
        elif self._init == 'pca':
            pca = PCA(
                n_components=self.n_components,
                svd_solver='randomized',
                random_state=random_state,
            )
            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
            warnings.warn(
                "The PCA initialization in TSNE will change to "
                "have the standard deviation of PC1 equal to 1e-4 "
                "in 1.2. This will ensure better convergence.", FutureWarning)
        elif self._init == 'random':
            # The embedding is initialized with iid samples from Gaussians with
            # standard deviation 1e-4.
            X_embedded = 1e-4 * random_state.randn(
                n_samples, self.n_components).astype(np.float32)
        else:
            raise ValueError("'init' must be 'pca', 'random', or "
                             "a numpy array")

        # Degrees of freedom of the Student's t-distribution. The suggestion
        # degrees_of_freedom = n_components - 1 comes from
        # "Learning a Parametric Embedding by Preserving Local Structure"
        # Laurens van der Maaten, 2009.
        degrees_of_freedom = max(self.n_components - 1, 1)

        daal_ready = self.method == 'barnes_hut' and self.n_components == 2 and \
            self.verbose == 0 and daal_check_version((2021, 'P', 600))

        if daal_ready:
            X_embedded = check_array(X_embedded,
                                     dtype=[np.float32, np.float64])
            return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
        return self._tsne(P,
                          degrees_of_freedom,
                          n_samples,
                          X_embedded=X_embedded,
                          neighbors=neighbors_nn,
                          skip_num_points=skip_num_points)
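
# Illustrative sketch (added for this collection, not part of the snippet
# above): the neighbor-count heuristic and the in-place distance squaring
# used by the Barnes-Hut path, with made-up sample sizes.
import numpy as np

def n_neighbors_for(n_samples, perplexity=30.0):
    # LvdM's rule: 3 * perplexity neighbors, capped at n_samples - 1.
    return min(n_samples - 1, int(3. * perplexity + 1))

print(n_neighbors_for(10000))  # 91 for the default perplexity
print(n_neighbors_for(50))     # 49 -- capped for a tiny dataset

# kneighbors_graph(mode='distance') stores Euclidean distances; squaring them
# in place keeps the affinities consistent with the 'exact' method.
distances = np.array([1.0, 2.0, 0.5])
distances **= 2
print(distances)               # [1.   4.   0.25]
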
Example #22
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_

        if sklearn_check_version('0.22'):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        classes_ = self.classes_[0]
        n_classes_ = self.n_classes_[0]
        # convert model to estimators
        params = {
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'random_state': None,
        }
        if not sklearn_check_version('1.0'):
            params['min_impurity_split'] = self.min_impurity_split
        est = DecisionTreeClassifier(**params)
        # populate est.tree_ with trees reconstructed from the Intel(R)
        # oneAPI Data Analytics Library model
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            # print("Tree #{}".format(i))
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = classes_
            est_i.n_classes_ = n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth',
            # 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i,
                                                      n_classes_)

            # node_ndarray = tree_i_state_class.node_ar
            # value_ndarray = tree_i_state_class.value_ar
            # value_shape = (node_ndarray.shape[0], self.n_outputs_,
            #                n_classes_)
            # assert np.allclose(
            #     value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')
            # ), "Value array is non-integer"
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }
            est_i.tree_ = Tree(self.n_features_in_,
                               np.array([n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self._cached_estimators_ = estimators_
        return estimators_
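
# Illustrative sketch (added for this collection, not part of the snippet
# above): the clone()/set_params()/check_random_state() pattern gives each
# converted tree its own seed, reproducibly derived from self.random_state.
# The values below are hypothetical.
import numpy as np
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state

template = DecisionTreeClassifier(max_depth=3)
rs = check_random_state(42)
seeds = [rs.randint(np.iinfo(np.int32).max) for _ in range(3)]
trees = [clone(template).set_params(random_state=s) for s in seeds]

# Re-creating the generator from the same seed yields the same sequence,
# so the converted forest is deterministic for a fixed random_state.
rs_again = check_random_state(42)
assert seeds == [rs_again.randint(np.iinfo(np.int32).max) for _ in range(3)]
print(seeds)
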
Example #23
class RandomForestClassifier(RandomForestClassifier_original):
    __doc__ = RandomForestClassifier_original.__doc__

    if sklearn_check_version('1.0'):

        def __init__(self,
                     n_estimators=100,
                     criterion="gini",
                     max_depth=None,
                     min_samples_split=2,
                     min_samples_leaf=1,
                     min_weight_fraction_leaf=0.,
                     max_features="auto",
                     max_leaf_nodes=None,
                     min_impurity_decrease=0.,
                     bootstrap=True,
                     oob_score=False,
                     n_jobs=None,
                     random_state=None,
                     verbose=0,
                     warm_start=False,
                     class_weight=None,
                     ccp_alpha=0.0,
                     max_samples=None,
                     maxBins=256,
                     minBinSize=1):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start,
                class_weight=class_weight)
            self.ccp_alpha = ccp_alpha
            self.max_samples = max_samples
            self.maxBins = maxBins
            self.minBinSize = minBinSize
            self.min_impurity_split = None
    else:

        def __init__(self,
                     n_estimators=100,
                     criterion="gini",
                     max_depth=None,
                     min_samples_split=2,
                     min_samples_leaf=1,
                     min_weight_fraction_leaf=0.,
                     max_features="auto",
                     max_leaf_nodes=None,
                     min_impurity_decrease=0.,
                     min_impurity_split=None,
                     bootstrap=True,
                     oob_score=False,
                     n_jobs=None,
                     random_state=None,
                     verbose=0,
                     warm_start=False,
                     class_weight=None,
                     ccp_alpha=0.0,
                     max_samples=None,
                     maxBins=256,
                     minBinSize=1):
            super(RandomForestClassifier, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                min_impurity_split=min_impurity_split,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start,
                class_weight=class_weight,
                ccp_alpha=ccp_alpha,
                max_samples=max_samples)
            self.maxBins = maxBins
            self.minBinSize = minBinSize

    def fit(self, X, y, sample_weight=None):
        """
        Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
        """
        return _fit_classifier(self, X, y, sample_weight=sample_weight)

    def predict(self, X):
        """
        Predict class for X.

        The predicted class of an input sample is a vote by the trees in
        the forest, weighted by their probability estimates. That is,
        the predicted class is the one with highest mean probability
        estimate across the trees.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
            The predicted classes.
        """

        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=[np.float64, np.float32])

        if not hasattr(self, 'daal_model_') or \
                sp.issparse(X) or self.n_outputs_ != 1:
            logging.info("sklearn.ensemble.RandomForestClassifier."
                         "predict: " + get_patch_message("sklearn"))
            return super(RandomForestClassifier, self).predict(X)
        logging.info("sklearn.ensemble.RandomForestClassifier."
                     "predict: " + get_patch_message("daal"))
        return _daal_predict_classifier(self, X)

    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest.
        The class probability of a single tree is the fraction of samples of
        the same class in a leaf.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        # Temporary solution
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        if hasattr(self, 'n_features_in_'):
            if X.shape[1] != self.n_features_in_:
                raise ValueError((f'X has {X.shape[1]} features, '
                                  f'but RandomForestClassifier is expecting '
                                  f'{self.n_features_in_} features as input'))
        logging.info("sklearn.ensemble.RandomForestClassifier."
                     "predict_proba: " + get_patch_message("sklearn"))
        return super(RandomForestClassifier, self).predict_proba(X)

        #X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
        #                dtype=[np.float64, np.float32])
        #if not hasattr(self, 'daal_model_') or \
        #        sp.issparse(X) or self.n_outputs_ != 1 or \
        #        not daal_check_version((2021, 'P', 200)):
        #    logging.info(
        #        "sklearn.ensemble.RandomForestClassifier."
        #        "predict_proba: " + get_patch_message("sklearn"))
        #    return super(RandomForestClassifier, self).predict_proba(X)
        #logging.info(
        #    "sklearn.ensemble.RandomForestClassifier."
        #    "predict_proba: " + get_patch_message("daal"))
        #return _daal_predict_proba(self, X)

    if sklearn_check_version('1.0'):

        @deprecated(
            "Attribute `n_features_` was deprecated in version 1.0 and will be "
            "removed in 1.2. Use `n_features_in_` instead.")
        @property
        def n_features_(self):
            return self.n_features_in_

    @property
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_

        if sklearn_check_version('0.22'):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        classes_ = self.classes_[0]
        n_classes_ = self.n_classes_[0]
        # convert model to estimators
        params = {
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'max_features': self.max_features,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'random_state': None,
        }
        if not sklearn_check_version('1.0'):
            params['min_impurity_split'] = self.min_impurity_split
        est = DecisionTreeClassifier(**params)
        # populate est.tree_ with trees reconstructed from the Intel(R)
        # oneAPI Data Analytics Library model
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            # print("Tree #{}".format(i))
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = classes_
            est_i.n_classes_ = n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth',
            # 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i,
                                                      n_classes_)

            # node_ndarray = tree_i_state_class.node_ar
            # value_ndarray = tree_i_state_class.value_ar
            # value_shape = (node_ndarray.shape[0], self.n_outputs_,
            #                n_classes_)
            # assert np.allclose(
            #     value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')
            # ), "Value array is non-integer"
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }
            est_i.tree_ = Tree(self.n_features_in_,
                               np.array([n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self._cached_estimators_ = estimators_
        return estimators_
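
# Usage sketch (added for this collection; the dataset and parameter values
# are made up). With the class defined above, predict() goes through the
# oneDAL model when one was trained; this version of predict_proba() always
# falls back to stock scikit-learn (see the "Temporary solution" above).
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
clf = RandomForestClassifier(n_estimators=50, maxBins=256, random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))
# _estimators_ lazily rebuilds per-tree DecisionTreeClassifier objects
# from the oneDAL forest (see the property above).
print(len(clf._estimators_))   # 50
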
Example #24
class ElasticNet(ElasticNet_original):
    __doc__ = ElasticNet_original.__doc__

    def __init__(
        self,
        alpha=1.0,
        l1_ratio=0.5,
        fit_intercept=True,
        normalize="deprecated" if sklearn_check_version('1.0') else False,
        precompute=False,
        max_iter=1000,
        copy_X=True,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='cyclic',
    ):
        super(ElasticNet, self).__init__(
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            max_iter=max_iter,
            copy_X=copy_X,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )

    if sklearn_check_version('0.23'):
        @support_usm_ndarray()
        def fit(self, X, y, sample_weight=None, check_input=True):
            """
            Fit model with coordinate descent.

            Parameters
            ----------
            X : {ndarray, sparse matrix} of (n_samples, n_features)
                Data.

            y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_targets)
                Target. Will be cast to X's dtype if necessary.

            sample_weight : float or array-like of shape (n_samples,), default=None
                Sample weights. Internally, the `sample_weight` vector will be
                rescaled to sum to `n_samples`.

                .. versionadded:: 0.23

            check_input : bool, default=True
                Allow to bypass several input checking.
                Don't use this parameter unless you know what you do.

            Returns
            -------
            self : object
                Fitted estimator.

            Notes
            -----
            Coordinate descent is an algorithm that considers each column of
            data at a time hence it will automatically convert the X input
            as a Fortran-contiguous numpy array if necessary.

            To avoid memory re-allocation it is advised to allocate the
            initial data in memory directly using that format.
            """
            return _fit(self, X, y, sample_weight=sample_weight, check_input=check_input)
    else:
        @support_usm_ndarray()
        def fit(self, X, y, check_input=True):
            """
            Fit model with coordinate descent.

            Parameters
            ----------
            X : ndarray or scipy.sparse matrix, (n_samples, n_features)
                Data

            y : ndarray, shape (n_samples,) or (n_samples, n_targets)
                Target. Will be cast to X's dtype if necessary

            check_input : boolean, (default=True)
                Allow to bypass several input checking.
                Don't use this parameter unless you know what you do.

            Notes
            -----

            Coordinate descent is an algorithm that considers each column of
            data at a time hence it will automatically convert the X input
            as a Fortran-contiguous numpy array if necessary.

            To avoid memory re-allocation it is advised to allocate the
            initial data in memory directly using that format.
            """
            return _fit(self, X, y, check_input=check_input)

    @support_usm_ndarray()
    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """

        if sklearn_check_version('1.0'):
            self._check_feature_names(X, reset=False)

        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        good_shape_for_daal = X.ndim <= 1 or X.shape[0] >= X.shape[1]

        _patching_status = PatchingConditionsChain(
            "sklearn.linear_model.ElasticNet.predict")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (good_shape_for_daal,
                "The shape of X does not satisfy oneDAL requirements: "
                "number of features > number of samples.")])
        _patching_status.write_log()

        if not _dal_ready:
            return self._decision_function(X)
        return _daal4py_predict_enet(self, X)

    @property
    def dual_gap_(self):
        if (self._gap is None):
            l1_reg = self.alpha * self.l1_ratio * self._X.shape[0]
            l2_reg = self.alpha * (1.0 - self.l1_ratio) * self._X.shape[0]
            n_targets = self._y.shape[1]

            if (n_targets == 1):
                self._gap = self.tol + 1.0
                X_offset = np.average(self._X, axis=0)
                y_offset = np.average(self._y, axis=0)
                coef = np.reshape(self.coef_, (self.coef_.shape[0], 1))
                R = (self._y - y_offset) - np.dot((self._X - X_offset), coef)
                XtA = np.dot((self._X - X_offset).T, R) - l2_reg * coef
                R_norm2 = np.dot(R.T, R)
                coef_norm2 = np.dot(self.coef_, self.coef_)
                dual_norm_XtA = np.max(
                    XtA) if self.positive else np.max(np.abs(XtA))
                if dual_norm_XtA > l1_reg:
                    const = l1_reg / dual_norm_XtA
                    A_norm2 = R_norm2 * (const ** 2)
                    self._gap = 0.5 * (R_norm2 + A_norm2)
                else:
                    const = 1.0
                    self._gap = R_norm2
                l1_norm = np.sum(np.abs(self.coef_))
                tmp = l1_reg * l1_norm
                tmp -= const * np.dot(R.T, (self._y - y_offset))
                tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2
                self._gap += tmp
                self._gap = self._gap[0][0]
            else:
                self._gap = np.full(n_targets, self.tol + 1.0)
                X_offset = np.average(self._X, axis=0)
                y_offset = np.average(self._y, axis=0)
                for k in range(n_targets):
                    R = (self._y[:, k] - y_offset[k]) - \
                        np.dot((self._X - X_offset), self.coef_[k, :].T)
                    XtA = np.dot((self._X - X_offset).T, R) - \
                        l2_reg * self.coef_[k, :].T
                    R_norm2 = np.dot(R.T, R)
                    coef_norm2 = np.dot(self.coef_[k, :], self.coef_[k, :].T)
                    dual_norm_XtA = np.max(
                        XtA) if self.positive else np.max(np.abs(XtA))
                    if dual_norm_XtA > l1_reg:
                        const = l1_reg / dual_norm_XtA
                        A_norm2 = R_norm2 * (const ** 2)
                        self._gap[k] = 0.5 * (R_norm2 + A_norm2)
                    else:
                        const = 1.0
                        self._gap[k] = R_norm2
                    l1_norm = np.sum(np.abs(self.coef_[k, :]))
                    tmp = l1_reg * l1_norm
                    tmp -= const * np.dot(R.T, (self._y[:, k] - y_offset[k]))
                    tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2
                    self._gap[k] += tmp
        return self._gap

    @dual_gap_.setter
    def dual_gap_(self, value):
        self._gap = value

    @dual_gap_.deleter
    def dual_gap_(self):
        self._gap = None
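
# Usage sketch (added for this collection; the data below is synthetic). When
# the oneDAL path is taken, _daal4py_fit_enet leaves self._gap as None and
# caches X/y, so the dual_gap_ property above computes the gap lazily on
# first access.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 10)                      # tall matrix: shape suits oneDAL
y = X @ rng.rand(10) + 0.01 * rng.randn(200)

enet = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
print(enet.coef_.shape, enet.intercept_)
print(enet.dual_gap_)                      # computed on demand by the property
print(enet.predict(X[:3]))                 # offloaded to oneDAL when supported
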
Example #25
def _fit(self, X, y, sample_weight=None, check_input=True):
    if sklearn_check_version('1.0'):
        self._check_feature_names(X, reset=True)
    # check X and y
    if check_input:
        X, y = check_X_y(
            X,
            y,
            copy=False,
            accept_sparse='csc',
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False)

    if not sp.issparse(X):
        self.fit_shape_good_for_daal_ = \
            X.ndim <= 1 or X.shape[0] >= X.shape[1]
    else:
        self.fit_shape_good_for_daal_ = False

    _function_name = f"sklearn.linear_model.{self.__class__.__name__}.fit"
    _patching_status = PatchingConditionsChain(
        _function_name)
    _dal_ready = _patching_status.and_conditions([
        (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
        (self.fit_shape_good_for_daal_,
            "The shape of X does not satisfy oneDAL requirements: "
            "number of features > number of samples."),
        (X.dtype == np.float64 or X.dtype == np.float32,
            f"'{X.dtype}' X data type is not supported. "
            "Only np.float32 and np.float64 are supported."),
        (sample_weight is None, "Sample weights are not supported.")])
    _patching_status.write_log()

    if not _dal_ready:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    self.n_iter_ = None
    self._gap = None

    if not check_input:
        # only for compliance with sklearn; this check is not required by
        # Intel(R) oneAPI Data Analytics Library
        if isinstance(X, np.ndarray) and \
                X.flags['F_CONTIGUOUS'] is False:
            # print(X.flags)
            raise ValueError("ndarray is not Fortran contiguous")

    if sklearn_check_version('1.0'):
        self._normalize = _deprecate_normalize(
            self.normalize,
            default=False,
            estimator_name=self.__class__.__name__)

    # only to pass the estimator checks
    # check_estimators_fit_returns_self(readonly_memmap=True) and
    # check_regressors_train(readonly_memmap=True): copy read-only inputs
    if not X.flags.writeable:
        X = np.copy(X)
    if not y.flags.writeable:
        y = np.copy(y)

    if self.__class__.__name__ == "ElasticNet":
        res = _daal4py_fit_enet(self, X, y, check_input=check_input)
    else:
        res = _daal4py_fit_lasso(self, X, y, check_input=check_input)
    if res is None:
        if hasattr(self, 'daal_model_'):
            del self.daal_model_
        logging.info(
            _function_name + ": " + get_patch_message("sklearn_after_daal")
        )
        if sklearn_check_version('0.23'):
            res_new = super(ElasticNet, self).fit(
                X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            res_new = super(ElasticNet, self).fit(
                X, y, check_input=check_input)
        self._gap = res_new.dual_gap_
        return res_new
    return res
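
# Illustrative sketch (added for this collection): fit_shape_good_for_daal_
# above reduces to "1-D input, or at least as many samples as features";
# wide matrices (and sparse ones) fall back to stock scikit-learn.
import numpy as np

def shape_good_for_daal(X):
    return X.ndim <= 1 or X.shape[0] >= X.shape[1]

print(shape_good_for_daal(np.zeros((100, 10))))   # True  -> oneDAL path possible
print(shape_good_for_daal(np.zeros((10, 100))))   # False -> sklearn fallback
print(shape_good_for_daal(np.zeros(50)))          # True  -> 1-D input
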
Example #26
import numpy as np
import numbers
import daal4py
from scipy import sparse as sp
from sklearn.utils import check_array, check_X_y
from sklearn.linear_model._coordinate_descent import ElasticNet as ElasticNet_original
from sklearn.linear_model._coordinate_descent import Lasso as Lasso_original
from daal4py.sklearn._utils import (
    make2d, getFPType, get_patch_message, sklearn_check_version, PatchingConditionsChain)
if sklearn_check_version('1.0'):
    from sklearn.linear_model._base import _deprecate_normalize

import logging

# only for compliance with Sklearn
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import normalize

from .._device_offload import support_usm_ndarray


def _daal4py_check(self, X, y, check_input):
    _fptype = getFPType(X)
Example #27
def _daal4py_fit_enet(self, X, y_, check_input):

    # appropriate checks
    _daal4py_check(self, X, y_, check_input)
    X = make2d(X)
    y = make2d(y_)
    _fptype = getFPType(X)

    # kept only for the dual_gap_ computation; it is not required by the
    # Intel(R) oneAPI Data Analytics Library
    self._X = X
    if sklearn_check_version('0.23'):
        self.n_features_in_ = X.shape[1]
    self._y = y

    penalty_L1 = np.asarray(self.alpha * self.l1_ratio, dtype=X.dtype)
    penalty_L2 = np.asarray(self.alpha * (1.0 - self.l1_ratio), dtype=X.dtype)
    if (penalty_L1.size != 1 or penalty_L2.size != 1):
        raise ValueError("alpha or l1_ratio length is wrong")
    penalty_L1 = penalty_L1.reshape((1, -1))
    penalty_L2 = penalty_L2.reshape((1, -1))

    # normalization and centering
    X_offset = np.zeros(X.shape[1], dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)
    if y.ndim == 1:
        y_offset = X.dtype.type(0)
    else:
        y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    _normalize = self._normalize if sklearn_check_version('1.0') else self.normalize
    if self.fit_intercept:
        X_offset = np.average(X, axis=0)
        if _normalize:
            if self.copy_X:
                X = np.copy(X) - X_offset
            else:
                X -= X_offset
            X, X_scale = normalize(X, axis=0, copy=False, return_norm=True)
            y_offset = np.average(y, axis=0)
            y = y - y_offset

    # only for compliance with Sklearn
    if isinstance(self.precompute, np.ndarray) and self.fit_intercept and \
       not np.allclose(X_offset, np.zeros(X.shape[1])) or \
       _normalize and not np.allclose(X_scale, np.ones(X.shape[1])):
        warnings.warn("Gram matrix was provided but X was centered"
                      " to fit intercept, "
                      "or X was normalized : recomputing Gram matrix.",
                      UserWarning)

    mse_alg = daal4py.optimization_solver_mse(
        numberOfTerms=X.shape[0],
        fptype=_fptype,
        method='defaultDense'
    )
    mse_alg.setup(X, y, None)

    cd_solver = daal4py.optimization_solver_coordinate_descent(
        function=mse_alg,
        fptype=_fptype,
        method='defaultDense',
        selection=self.selection,
        seed=0 if self.random_state is None else self.random_state,
        nIterations=self.max_iter,
        positive=self.positive,
        accuracyThreshold=self.tol,
    )

    # set warm_start
    if self.warm_start and hasattr(self, "coef_") and \
            isinstance(self.coef_, np.ndarray):
        n_rows = y.shape[1]
        n_cols = X.shape[1] + 1
        inputArgument = np.zeros((n_rows, n_cols), dtype=_fptype)
        for i in range(n_rows):
            inputArgument[i][0] = self.intercept_ if (
                n_rows == 1) else self.intercept_[i]
            inputArgument[i][1:] = self.coef_[:].copy(order='C') if (
                n_rows == 1) else self.coef_[i, :].copy(order='C')
        cd_solver.setup(inputArgument)
    doUse_condition = self.copy_X is False or \
        (self.fit_intercept and _normalize and self.copy_X)
    elastic_net_alg = daal4py.elastic_net_training(
        fptype=_fptype,
        method='defaultDense',
        interceptFlag=(
            self.fit_intercept is True),
        dataUseInComputation='doUse' if doUse_condition else 'doNotUse',
        penaltyL1=penalty_L1,
        penaltyL2=penalty_L2,
        optimizationSolver=cd_solver
    )
    try:
        if isinstance(self.precompute, np.ndarray):
            elastic_net_res = elastic_net_alg.compute(
                data=X, dependentVariables=y, gramMatrix=self.precompute)
        else:
            elastic_net_res = elastic_net_alg.compute(
                data=X, dependentVariables=y)
    except RuntimeError:
        return None

    # set coef_ and intercept_ results
    elastic_net_model = elastic_net_res.model
    self.daal_model_ = elastic_net_model

    # update coefficients if normalizing and centering
    if self.fit_intercept and _normalize:
        elastic_net_model.Beta[:, 1:] = elastic_net_model.Beta[:, 1:] / X_scale
        elastic_net_model.Beta[:, 0] = (
            y_offset - np.dot(X_offset, elastic_net_model.Beta[:, 1:].T)).T

    coefs = elastic_net_model.Beta

    self.intercept_ = coefs[:, 0].copy(order='C')
    self.coef_ = coefs[:, 1:].copy(order='C')

    # only for compliance with Sklearn
    if y.shape[1] == 1:
        self.coef_ = np.ravel(self.coef_)
    self.intercept_ = np.ravel(self.intercept_)
    if self.intercept_.shape[0] == 1:
        self.intercept_ = self.intercept_[0]

    # set n_iter_
    n_iter = cd_solver.__get_result__().nIterations[0][0]
    if y.shape[1] == 1:
        self.n_iter_ = n_iter
    else:
        self.n_iter_ = np.full(y.shape[1], n_iter)

    # only for compliance with Sklearn
    if self.max_iter == n_iter + 1:
        warnings.warn("Objective did not converge. You might want to "
                      "increase the number of iterations.", ConvergenceWarning)

    return self
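
# Illustrative sketch (added for this collection; the numbers are made up):
# the warm-start block above packs the previous solution with one row per
# target, the intercept in column 0 and the coefficients in columns 1:.
import numpy as np

intercept_ = np.array([0.5, -1.0])                 # two targets
coef_ = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])                # (n_targets, n_features)

n_rows, n_cols = coef_.shape[0], coef_.shape[1] + 1
start_point = np.zeros((n_rows, n_cols))
for i in range(n_rows):
    start_point[i, 0] = intercept_[i]
    start_point[i, 1:] = coef_[i, :]
print(start_point)
# [[ 0.5  1.   2.   3. ]
#  [-1.   4.   5.   6. ]]
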
Example #28
class Lasso(ElasticNet):
    __doc__ = Lasso_original.__doc__

    def __init__(
        self,
        alpha=1.0,
        fit_intercept=True,
        normalize="deprecated" if sklearn_check_version('1.0') else False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='cyclic',
    ):
        super().__init__(
            alpha=alpha,
            l1_ratio=1.0,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )

    if sklearn_check_version('0.23'):
        @support_usm_ndarray()
        def fit(self, X, y, sample_weight=None, check_input=True):
            """
            Fit model with coordinate descent.

            Parameters
            ----------
            X : {ndarray, sparse matrix} of (n_samples, n_features)
                Data.

            y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_targets)
                Target. Will be cast to X's dtype if necessary.

            sample_weight : float or array-like of shape (n_samples,), default=None
                Sample weights. Internally, the `sample_weight` vector will be
                rescaled to sum to `n_samples`.

                .. versionadded:: 0.23

            check_input : bool, default=True
                Allow to bypass several input checking.
                Don't use this parameter unless you know what you do.

            Returns
            -------
            self : object
                Fitted estimator.

            Notes
            -----
            Coordinate descent is an algorithm that considers each column of
            data at a time hence it will automatically convert the X input
            as a Fortran-contiguous numpy array if necessary.

            To avoid memory re-allocation it is advised to allocate the
            initial data in memory directly using that format.
            """
            return _fit(self, X, y, sample_weight, check_input)
    else:
        @support_usm_ndarray()
        def fit(self, X, y, check_input=True):
            """
            Fit model with coordinate descent.

            Parameters
            ----------
            X : ndarray or scipy.sparse matrix, (n_samples, n_features)
                Data

            y : ndarray, shape (n_samples,) or (n_samples, n_targets)
                Target. Will be cast to X's dtype if necessary

            check_input : boolean, (default=True)
                Allow to bypass several input checking.
                Don't use this parameter unless you know what you do.

            Notes
            -----

            Coordinate descent is an algorithm that considers each column of
            data at a time hence it will automatically convert the X input
            as a Fortran-contiguous numpy array if necessary.

            To avoid memory re-allocation it is advised to allocate the
            initial data in memory directly using that format.
            """
            return _fit(self, X, y, check_input=check_input)

    @support_usm_ndarray()
    def predict(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : array-like or sparse matrix, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape = (n_samples,)
            Returns predicted values.
        """
        if sklearn_check_version('1.0'):
            self._check_feature_names(X, reset=False)
        X = check_array(
            X,
            accept_sparse=['csr', 'csc', 'coo'],
            dtype=[np.float64, np.float32]
        )
        good_shape_for_daal = X.ndim <= 1 or X.shape[0] >= X.shape[1]

        _patching_status = PatchingConditionsChain(
            "sklearn.linear_model.Lasso.predict")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (good_shape_for_daal,
                "The shape of X does not satisfy oneDAL requirements: "
                "number of features > number of samples.")])
        _patching_status.write_log()

        if not _dal_ready:
            return self._decision_function(X)
        return _daal4py_predict_lasso(self, X)
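
# Illustrative sketch (added for this collection): as the __init__ above shows,
# Lasso is ElasticNet with l1_ratio fixed at 1.0. The same relationship holds
# for the stock scikit-learn estimators, used here so the sketch runs on its own.
import numpy as np
from sklearn.linear_model import ElasticNet as SkElasticNet, Lasso as SkLasso

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = X @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + 0.01 * rng.randn(100)

lasso = SkLasso(alpha=0.1).fit(X, y)
enet = SkElasticNet(alpha=0.1, l1_ratio=1.0).fit(X, y)
print(np.allclose(lasso.coef_, enet.coef_))        # True: identical penalty
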
Example #29
class RandomForestRegressor(RandomForestRegressor_original):
    __doc__ = RandomForestRegressor_original.__doc__

    if sklearn_check_version('1.0'):

        def __init__(self,
                     n_estimators=100,
                     *,
                     criterion="squared_error",
                     max_depth=None,
                     min_samples_split=2,
                     min_samples_leaf=1,
                     min_weight_fraction_leaf=0.,
                     max_features="auto",
                     max_leaf_nodes=None,
                     min_impurity_decrease=0.,
                     bootstrap=True,
                     oob_score=False,
                     n_jobs=None,
                     random_state=None,
                     verbose=0,
                     warm_start=False,
                     ccp_alpha=0.0,
                     max_samples=None,
                     maxBins=256,
                     minBinSize=1):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start)
            self.ccp_alpha = ccp_alpha
            self.max_samples = max_samples
            self.maxBins = maxBins
            self.minBinSize = minBinSize
            self.min_impurity_split = None
    else:

        def __init__(self,
                     n_estimators=100,
                     *,
                     criterion="mse",
                     max_depth=None,
                     min_samples_split=2,
                     min_samples_leaf=1,
                     min_weight_fraction_leaf=0.,
                     max_features="auto",
                     max_leaf_nodes=None,
                     min_impurity_decrease=0.,
                     min_impurity_split=None,
                     bootstrap=True,
                     oob_score=False,
                     n_jobs=None,
                     random_state=None,
                     verbose=0,
                     warm_start=False,
                     ccp_alpha=0.0,
                     max_samples=None,
                     maxBins=256,
                     minBinSize=1):
            super(RandomForestRegressor, self).__init__(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_features=max_features,
                max_leaf_nodes=max_leaf_nodes,
                min_impurity_decrease=min_impurity_decrease,
                min_impurity_split=min_impurity_split,
                bootstrap=bootstrap,
                oob_score=oob_score,
                n_jobs=n_jobs,
                random_state=random_state,
                verbose=verbose,
                warm_start=warm_start,
                ccp_alpha=ccp_alpha,
                max_samples=max_samples)
            self.maxBins = maxBins
            self.minBinSize = minBinSize

    def fit(self, X, y, sample_weight=None):
        """
        Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
        """
        return _fit_regressor(self, X, y, sample_weight=sample_weight)

    def predict(self, X):
        """
        Predict regression target for X.

        The predicted regression target of an input sample is computed as
        the mean predicted regression targets of the trees in the forest.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
            The predicted values.
        """
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=[np.float64, np.float32])

        if not hasattr(self, 'daal_model_') or \
                sp.issparse(X) or self.n_outputs_ != 1:
            logging.info("sklearn.ensemble.RandomForestRegressor."
                         "predict: " + get_patch_message("sklearn"))
            return super(RandomForestRegressor, self).predict(X)
        logging.info("sklearn.ensemble.RandomForestRegressor."
                     "predict: " + get_patch_message("daal"))
        return _daal_predict_regressor(self, X)

    if sklearn_check_version('1.0'):

        @deprecated(
            "Attribute `n_features_` was deprecated in version 1.0 and will be "
            "removed in 1.2. Use `n_features_in_` instead.")
        @property
        def n_features_(self):
            return self.n_features_in_

    @property
    def _estimators_(self):
        if hasattr(self, '_cached_estimators_'):
            if self._cached_estimators_:
                return self._cached_estimators_
        if sklearn_check_version('0.22'):
            check_is_fitted(self)
        else:
            check_is_fitted(self, 'daal_model_')
        # convert model to estimators
        if sklearn_check_version('1.0'):
            est = DecisionTreeRegressor(
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                random_state=None)
        else:
            est = DecisionTreeRegressor(
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                min_impurity_split=self.min_impurity_split,
                random_state=None)

        # populate est.tree_ with trees reconstructed from the Intel(R)
        # oneAPI Data Analytics Library model
        estimators_ = []
        random_state_checked = check_random_state(self.random_state)
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.set_params(random_state=random_state_checked.randint(
                np.iinfo(np.int32).max))
            if sklearn_check_version('1.0'):
                est_i.n_features_in_ = self.n_features_in_
            else:
                est_i.n_features_ = self.n_features_in_
            est_i.n_outputs_ = self.n_outputs_

            tree_i_state_class = daal4py.getTreeState(self.daal_model_, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_in_, np.array([1],
                                                             dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self._cached_estimators_ = estimators_
        return estimators_
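
# Illustrative sketch (added for this collection): scikit-learn's low-level
# Tree state always carries a trailing "classes" dimension; for regressors it
# has size 1, which is why the code above builds Tree(..., np.array([1]), ...).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
reg_tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X, y)
state = reg_tree.tree_.__getstate__()
print(sorted(state.keys()))    # includes 'max_depth', 'node_count', 'nodes', 'values'
print(state['values'].shape)   # (node_count, n_outputs, 1)
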
Example #30
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    features_per_node_ = _to_absolute_max_features(self.max_features,
                                                   X.shape[1],
                                                   is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine_,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None
    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
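
# Illustrative sketch (added for this collection): min_samples_leaf and
# min_samples_split accept either absolute counts or fractions of the training
# set; fractional values are converted with ceil() exactly as in the call above.
import numbers
from math import ceil

def to_min_observations(value, n_samples):
    if isinstance(value, numbers.Integral):
        return value
    return int(ceil(value * n_samples))

print(to_min_observations(5, 1000))      # 5   -- already an absolute count
print(to_min_observations(0.01, 1000))   # 10  -- 1% of 1000 samples
print(to_min_observations(0.015, 1000))  # 15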