def compute(train_data, train_labels, predict_data, method='defaultDense'):
    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        fptype='float',
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError',
        method=method)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    train_result = train_algo.compute(train_data, train_labels)

    # Now predict using the model from the training above
    predict_algo = d4p.decision_forest_classification_prediction(
        nClasses=5,
        fptype='float',
        resultsToEvaluate="computeClassLabels|computeClassProbabilities",
        votingMethod="unweighted")
    predict_result = predict_algo.compute(predict_data, train_result.model)

    return train_result, predict_result
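# A minimal usage sketch for compute() above. The synthetic arrays and the
# `d4p`/`np` import aliases are assumptions for illustration; any float32
# 2-D feature arrays with a (n_samples, 1) label column in [0, 5) would do.
import numpy as np
import daal4py as d4p

X_train = np.random.rand(100, 3).astype(np.float32)
y_train = np.random.randint(0, 5, size=(100, 1)).astype(np.float32)
X_test = np.random.rand(20, 3).astype(np.float32)

train_res, predict_res = compute(X_train, y_train, X_test)
# one predicted label per test row, plus one probability row per class
print(predict_res.prediction.shape)     # (20, 1)
print(predict_res.probabilities.shape)  # (20, 5)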
def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345,
                n_features_per_node=0, max_depth=0, min_impurity=0,
                bootstrap=True, verbose=False):
    fptype = getFPType(X)

    features_per_node = X.shape[1]
    if 0 < n_features_per_node < features_per_node:
        features_per_node = n_features_per_node

    engine = engines_mt2203(seed=seed, fptype=fptype)

    algorithm = decision_forest_classification_training(
        nClasses=n_classes,
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=features_per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap
    )

    df_clsf_result = algorithm.compute(X, y)

    return df_clsf_result
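# Hedged usage sketch for df_clsf_fit(). The import paths below are
# assumptions (the benchmark this snippet comes from defines its own
# helpers); the synthetic data is purely illustrative. daal4py expects
# labels as an (n_samples, 1) column.
import numpy as np
from daal4py import decision_forest_classification_training, engines_mt2203
from daal4py.sklearn._utils import getFPType

X = np.ascontiguousarray(np.random.rand(200, 8))
y = np.random.randint(0, 3, size=(200, 1)).astype(np.float64)

result = df_clsf_fit(X, y, n_classes=3, n_trees=50, seed=777)
# result.model can then be fed to a
# decision_forest_classification_prediction algorithm, as in compute() above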
def main():
    # Input data files
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError')

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3), t=np.float32)
    labels = read_csv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_classification_prediction(5)
    # Read test data (with the same number of features)
    pdata = read_csv(testfile, range(3), t=np.float32)
    plabels = read_csv(testfile, range(3, 4), t=np.float32)
    # Now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Prediction result provides prediction
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, plabels)
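# main() relies on a read_csv helper that is not shown here. A minimal
# stand-in is sketched below, assuming comma-separated files without a
# header row; the exact helper in the daal4py examples may differ.
import numpy as np
import pandas as pd

def read_csv(f, c, t=np.float64):
    # read only the requested columns `c`, cast to dtype `t`
    return pd.read_csv(f, usecols=c, delimiter=',',
                       header=None, dtype=t).to_numpy()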
def _daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.asarray(y)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(
            _class_name + " does not currently support multi-output data. "
            "Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=int(self.min_samples_leaf),
        engine=daal_engine_,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap)
    )
    self._cached_estimators_ = None

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
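# _to_absolute_max_features (used above) maps scikit-learn's max_features
# setting onto the absolute featuresPerNode count that oneDAL expects.
# The sketch below is an assumption reconstructed from how the helper is
# called here, not the library's actual implementation.
import numpy as np

def to_absolute_max_features_sketch(max_features, n_features,
                                    is_classification=False):
    if max_features is None:
        return n_features
    if isinstance(max_features, str):
        if max_features == 'sqrt':
            return max(1, int(np.sqrt(n_features)))
        if max_features == 'log2':
            return max(1, int(np.log2(n_features)))
        if max_features == 'auto':
            # historical sklearn default: sqrt for classification,
            # all features otherwise
            return (max(1, int(np.sqrt(n_features)))
                    if is_classification else n_features)
        raise ValueError("Invalid value for max_features: %r" % max_features)
    if isinstance(max_features, (int, np.integer)):
        return max_features
    # a float is interpreted as a fraction of the feature count
    return max(1, int(max_features * n_features))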
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)

    # the limit on the number of streams for mt2203 is 6024;
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version((2021, 'P', 200))
        else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(
            self.min_samples_leaf
            if isinstance(self.min_samples_leaf, numbers.Integral)
            else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(
            self.min_samples_split
            if isinstance(self.min_samples_split, numbers.Integral)
            else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    # if self.oob_score:
    #     self.estimators_ = self._estimators_
    #     self._set_oob_score(X, y)

    return self
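# With the trained model stored on self.daal_model_, class probabilities
# can be obtained through the same daal4py prediction algorithm shown in
# the first snippet. A minimal sketch, assuming a fitted estimator `est`
# exposing daal_model_; the function name and parameters here are
# illustrative, not the library's own predict path.
import daal4py

def predict_proba_sketch(est, X, n_classes):
    fptype = getFPType(X)
    predict_algo = daal4py.decision_forest_classification_prediction(
        nClasses=int(n_classes),
        fptype=fptype,
        resultsToEvaluate="computeClassProbabilities",
        votingMethod="unweighted")
    # result.probabilities is an (n_samples, n_classes) array
    return predict_algo.compute(X, est.daal_model_).probabilities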
def daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(
            _class_name + " does not currently support multi-output data. "
            "Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    # this is a classifier fit, so features per node are resolved with
    # is_classification=True
    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine_,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap))

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # convert model to estimators
    est = DecisionTreeClassifier(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        # print("Tree #{}".format(i))
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_
        est_i.classes_ = self.classes_
        est_i.n_classes_ = self.n_classes_
        # treeState members: 'class_count', 'leaf_count', 'max_depth',
        # 'node_ar', 'node_count', 'value_ar'
        tree_i_state_class = daal4py.getTreeState(model, i, self.n_classes_)

        node_ndarray = tree_i_state_class.node_ar
        value_ndarray = tree_i_state_class.value_ar
        value_shape = (node_ndarray.shape[0], self.n_outputs_,
                       self.n_classes_)
        # assert np.allclose(
        #     value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')
        # ), "Value array is non-integer"
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': node_ndarray,
            'values': value_ndarray}

        # the Tree object must be constructed before __setstate__
        # can populate it
        est_i.tree_ = Tree(self.n_features_,
                           np.array([self.n_classes_], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self.estimators_ = estimators_

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
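# After daal_fit completes, the reconstructed trees behave like ordinary
# sklearn decision trees. A small illustrative helper, assuming `clf` is
# a forest estimator on which daal_fit has run (the function name is an
# assumption for this sketch):
def inspect_converted_forest(clf):
    # each entry of estimators_ carries a Tree restored via getTreeState
    for idx, est in enumerate(clf.estimators_):
        print("tree %d: %d nodes, depth %d"
              % (idx, est.tree_.node_count, est.tree_.max_depth))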