def compute(train_data, train_labels, predict_data, method='defaultDense'):
    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        fptype='float',
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError',
        method=method)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    train_result = train_algo.compute(train_data, train_labels)

    # Now predict using the model from the training above
    predict_algo = d4p.decision_forest_classification_prediction(
        nClasses=5,
        fptype='float',
        resultsToEvaluate="computeClassLabels|computeClassProbabilities",
        votingMethod="unweighted")
    predict_result = predict_algo.compute(predict_data, train_result.model)

    return train_result, predict_result
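# A minimal usage sketch for compute() above. The synthetic arrays and the
# `d4p`/`np` import aliases are assumptions for illustration; any float32
# 2-D feature arrays with a (n_samples, 1) label column in [0, 5) would do.
import numpy as np
import daal4py as d4p

X_train = np.random.rand(100, 3).astype(np.float32)
y_train = np.random.randint(0, 5, size=(100, 1)).astype(np.float32)
X_test = np.random.rand(20, 3).astype(np.float32)

train_res, predict_res = compute(X_train, y_train, X_test)
# one predicted label per test row, plus one probability row per class
print(predict_res.prediction.shape)     # (20, 1)
print(predict_res.probabilities.shape)  # (20, 5)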
def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345,
                n_features_per_node=0, max_depth=0, min_impurity=0,
                bootstrap=True, verbose=False):
    fptype = getFPType(X)

    features_per_node = X.shape[1]
    if 0 < n_features_per_node < features_per_node:
        features_per_node = n_features_per_node

    engine = engines_mt2203(seed=seed, fptype=fptype)

    algorithm = decision_forest_classification_training(
        nClasses=n_classes,
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=features_per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap
    )

    df_clsf_result = algorithm.compute(X, y)

    return df_clsf_result
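# Hedged usage sketch for df_clsf_fit(). The import paths below are
# assumptions (the benchmark this snippet comes from defines its own
# helpers); the synthetic data is purely illustrative. daal4py expects
# labels as an (n_samples, 1) column.
import numpy as np
from daal4py import decision_forest_classification_training, engines_mt2203
from daal4py.sklearn._utils import getFPType

X = np.ascontiguousarray(np.random.rand(200, 8))
y = np.random.randint(0, 3, size=(200, 1)).astype(np.float64)

result = df_clsf_fit(X, y, n_classes=3, n_trees=50, seed=777)
# result.model can then be fed to a
# decision_forest_classification_prediction algorithm, as in compute() above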
def main():
    # Input data files
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError')

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3), t=np.float32)
    labels = read_csv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_classification_prediction(5)
    # Read test data (with the same number of features)
    pdata = read_csv(testfile, range(3), t=np.float32)
    plabels = read_csv(testfile, range(3, 4), t=np.float32)
    # Now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Prediction result provides prediction
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, plabels)
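# main() relies on a read_csv helper that is not shown here. A minimal
# stand-in is sketched below, assuming comma-separated files without a
# header row; the exact helper in the daal4py examples may differ.
import numpy as np
import pandas as pd

def read_csv(f, c, t=np.float64):
    # read only the requested columns `c`, cast to dtype `t`
    return pd.read_csv(f, usecols=c, delimiter=',',
                       header=None, dtype=t).to_numpy()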
def _daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.asarray(y)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(
            _class_name + " does not currently support multi-output data. "
            "Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=int(self.min_samples_leaf),
        engine=daal_engine_,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap)
    )
    self._cached_estimators_ = None

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
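# _to_absolute_max_features (used above) maps scikit-learn's max_features
# setting onto the absolute featuresPerNode count that oneDAL expects.
# The sketch below is an assumption reconstructed from how the helper is
# called here, not the library's actual implementation.
import numpy as np

def to_absolute_max_features_sketch(max_features, n_features,
                                    is_classification=False):
    if max_features is None:
        return n_features
    if isinstance(max_features, str):
        if max_features == 'sqrt':
            return max(1, int(np.sqrt(n_features)))
        if max_features == 'log2':
            return max(1, int(np.log2(n_features)))
        if max_features == 'auto':
            # historical sklearn default: sqrt for classification,
            # all features otherwise
            return (max(1, int(np.sqrt(n_features)))
                    if is_classification else n_features)
        raise ValueError("Invalid value for max_features: %r" % max_features)
    if isinstance(max_features, (int, np.integer)):
        return max_features
    # a float is interpreted as a fraction of the feature count
    return max(1, int(max_features * n_features))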
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)

    # the limit on the number of streams for mt2203 is 6024;
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist' if daal_check_version((2021, 'P', 200))
        else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(
            self.min_samples_leaf
            if isinstance(self.min_samples_leaf, numbers.Integral)
            else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(
            self.min_samples_split
            if isinstance(self.min_samples_split, numbers.Integral)
            else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)
    self._cached_estimators_ = None

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    # if self.oob_score:
    #     self.estimators_ = self._estimators_
    #     self._set_oob_score(X, y)

    return self
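# With the trained model stored on self.daal_model_, class probabilities
# can be obtained through the same daal4py prediction algorithm shown in
# the first snippet. A minimal sketch, assuming a fitted estimator `est`
# exposing daal_model_; the function name and parameters here are
# illustrative, not the library's own predict path.
import daal4py

def predict_proba_sketch(est, X, n_classes):
    fptype = getFPType(X)
    predict_algo = daal4py.decision_forest_classification_prediction(
        nClasses=int(n_classes),
        fptype=fptype,
        resultsToEvaluate="computeClassProbabilities",
        votingMethod="unweighted")
    # result.probabilities is an (n_samples, n_classes) array
    return predict_algo.compute(X, est.daal_model_).probabilities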
def daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.single, np.double]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    if self.n_outputs_ != 1:
        _class_name = self.__class__.__name__
        raise ValueError(
            _class_name + " does not currently support multi-output data. "
            "Consider using OneHotEncoder")

    y = check_array(y, ensure_2d=False, dtype=None)
    y, _ = self._validate_y_class_weight(y)
    self.n_classes_ = self.n_classes_[0]
    self.classes_ = self.classes_[0]

    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if self.n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    X_fptype = getFPType(X)
    daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    # this is a classifier fit, so features per node are resolved with
    # is_classification=True
    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(self.n_classes_),
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine_,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap))

    # compute
    dfc_trainingResult = dfc_algorithm.compute(X, y)

    # get resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    # convert model to estimators
    est = DecisionTreeClassifier(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        # print("Tree #{}".format(i))
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_
        est_i.classes_ = self.classes_
        est_i.n_classes_ = self.n_classes_
        # treeState members: 'class_count', 'leaf_count', 'max_depth',
        # 'node_ar', 'node_count', 'value_ar'
        tree_i_state_class = daal4py.getTreeState(model, i, self.n_classes_)

        node_ndarray = tree_i_state_class.node_ar
        value_ndarray = tree_i_state_class.value_ar
        value_shape = (node_ndarray.shape[0], self.n_outputs_,
                       self.n_classes_)
        # assert np.allclose(
        #     value_ndarray, value_ndarray.astype(np.intc, casting='unsafe')
        # ), "Value array is non-integer"
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': node_ndarray,
            'values': value_ndarray}

        # the Tree object must be constructed before __setstate__
        # can populate it
        est_i.tree_ = Tree(self.n_features_,
                           np.array([self.n_classes_], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self.estimators_ = estimators_

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
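# After daal_fit completes, the reconstructed trees behave like ordinary
# sklearn decision trees. A small illustrative helper, assuming `clf` is
# a forest estimator on which daal_fit has run (the function name is an
# assumption for this sketch):
def inspect_converted_forest(clf):
    # each entry of estimators_ carries a Tree restored via getTreeState
    for idx, est in enumerate(clf.estimators_):
        print("tree %d: %d nodes, depth %d"
              % (idx, est.tree_.node_count, est.tree_.max_depth))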