def main(readcsv=read_csv, method='defaultDense'):
    infile = "./data/batch/df_regression_train.csv"
    testfile = "./data/batch/df_regression_test.csv"

    # Configure a decision forest regression training object
    train_algo = d4p.decision_forest_regression_training(
        nTrees=100,
        varImportance='MDA_Raw',
        bootstrap=True,
        engine=d4p.engines_mt2203(seed=777),
        resultsToCompute='computeOutOfBagError|computeOutOfBagErrorPerObservation')

    # Read data. Let's have 13 independent, and 1 dependent variables (for each observation)
    indep_data = readcsv(infile, range(13), t=np.float32)
    dep_data = readcsv(infile, range(13, 14), t=np.float32)

    # Now train/compute, the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)
    # Training result provides (depending on parameters) model, outOfBagError,
    # outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(13), t=np.float32)
    ptdata = readcsv(testfile, range(13, 14), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # The prediction result provides prediction
    assert predict_result.prediction.shape == (pdata.shape[0], dep_data.shape[1])

    return (train_result, predict_result, ptdata)
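A minimal driver sketch for the example above, assuming a pandas-backed read_csv helper with the (file, columns, t=dtype) signature the snippet expects; the helper body and the __main__ block are assumptions, while the result attributes (variableImportance, outOfBagError) follow from the parameters requested above.

import daal4py as d4p
import numpy as np
import pandas as pd

def read_csv(f, c, t=np.float64):
    # Hypothetical helper: read the selected columns of a headerless CSV
    # into an ndarray of the requested dtype.
    return pd.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t).values

if __name__ == "__main__":
    train_result, predict_result, ptdata = main(readcsv=read_csv)
    print("Variable importance:\n", train_result.variableImportance)
    print("Out-of-bag error:\n", train_result.outOfBagError)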
def run_inference(num_observations: int = 1000):
    """Benchmark daal4py decision forest training for the specified number of observations"""
    # Load data
    train_x_df = common.get_test_data_df(X=common.X_df, size=num_observations)
    train_y_df = common.get_test_data_df(X=common.y_df, size=num_observations)
    num_rows = len(train_x_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()
        MODEL = d4p.decision_forest_regression_training(nTrees=100)
        train_result = MODEL.compute(train_x_df, train_y_df)
        end_time = timer()

        total_time = end_time - start_time
        run_times.append(total_time * 10e3)

        inference_time = total_time * (10e6) / num_rows
        inference_times.append(inference_time)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
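A sketch of sweeping this benchmark over several dataset sizes, assuming the common module and NUM_LOOPS defined alongside this function (the size list is illustrative):

if __name__ == "__main__":
    for n in (1000, 5000, 10000):
        run_inference(n)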
def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0,
                max_depth=0, min_impurity=0, bootstrap=True):
    fptype = getFPType(X)

    features_per_node = X.shape[1]
    if n_features_per_node > 0 and n_features_per_node <= features_per_node:
        features_per_node = n_features_per_node

    engine = engines_mt2203(seed=seed, fptype=fptype)

    algorithm = decision_forest_regression_training(
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=features_per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap)

    df_regr_result = algorithm.compute(X, y)

    return df_regr_result
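A minimal sketch exercising df_regr_fit on synthetic data. The daal4py imports match the unqualified names the function uses; the getFPType stand-in below is a hypothetical replacement for daal4py's helper of the same name, mapping ndarray dtypes to the 'float'/'double' strings daal4py expects.

import numpy as np
from daal4py import (decision_forest_regression_training,
                     decision_forest_regression_prediction,
                     engines_mt2203)

def getFPType(X):
    # Hypothetical stand-in for daal4py's helper of the same name.
    return 'float' if X.dtype == np.float32 else 'double'

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 13)).astype(np.float32)
y = (X @ rng.standard_normal((13, 1))).astype(np.float32)

result = df_regr_fit(X, y, n_trees=50, seed=777)
pred = decision_forest_regression_prediction(fptype=getFPType(X)).compute(X, result.model)
print(pred.prediction[:5])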
def _daal_fit_regressor(self, X, y, sample_weight=None):
    self.n_features_ = X.shape[1]

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1],
                                                 is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples
    )

    if sample_weight is not None:
        sample_weight = [sample_weight]

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap
        if self.bootstrap is True else 1.,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf
                                   if isinstance(self.min_samples_leaf, numbers.Integral)
                                   else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split
                                    if isinstance(self.min_samples_split, numbers.Integral)
                                    else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes
    )

    self._cached_estimators_ = None
    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
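This method is an internal of daal4py's scikit-learn patching layer rather than a public entry point; it is normally reached through the drop-in estimator's fit(). A sketch under that assumption, using the daal4py.sklearn.ensemble.RandomForestRegressor class path with synthetic data:

import numpy as np
from daal4py.sklearn.ensemble import RandomForestRegressor

X = np.random.rand(500, 13)
y = np.random.rand(500)

# fit() dispatches to _daal_fit_regressor when the parameters are supported
model = RandomForestRegressor(n_estimators=100, random_state=777).fit(X, y)
print(model.predict(X[:5]))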
def compute(train_data, train_labels, predict_data, method='defaultDense'):
    # Configure a training object
    train_algo = d4p.decision_forest_regression_training(
        nTrees=100,
        engine=d4p.engines_mt2203(seed=777),
        varImportance='MDA_Raw',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError|computeOutOfBagErrorPerObservation',
        method=method
    )
    # Training result provides (depending on parameters) model, outOfBagError,
    # outOfBagErrorPerObservation and/or variableImportance
    train_result = train_algo.compute(train_data, train_labels)

    # now predict using the model from the training above
    predict_algo = d4p.decision_forest_regression_prediction()
    predict_result = predict_algo.compute(predict_data, train_result.model)

    return train_result, predict_result
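A sketch of calling this helper on synthetic arrays and reading back the out-of-bag results it requests (shapes are illustrative; 13 features mirror the other examples in this section):

import daal4py as d4p
import numpy as np

X_train = np.random.rand(1000, 13)
y_train = np.random.rand(1000, 1)
X_test = np.random.rand(100, 13)

train_result, predict_result = compute(X_train, y_train, X_test)
print("OOB error:", train_result.outOfBagError)
print("prediction shape:", predict_result.prediction.shape)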
def compute(train_data, train_labels, predict_data):
    # Configure a training object
    train_algo = d4p.decision_forest_regression_training(
        method='hist',
        maxBins=256,
        minBinSize=1,
        nTrees=100,
        fptype='float',
        varImportance='MDA_Raw',
        bootstrap=True,
        engine=d4p.engines_mt2203(seed=777),
        resultsToCompute='computeOutOfBagError|computeOutOfBagErrorPerObservation')
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    train_result = train_algo.compute(train_data, train_labels)

    # now predict using the model from the training above
    predict_algo = d4p.decision_forest_regression_prediction(fptype='float')
    predict_result = predict_algo.compute(predict_data, train_result.model)

    return train_result, predict_result
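The 'hist' method buckets feature values into at most maxBins histogram bins before evaluating splits, which generally trains faster than 'defaultDense' on large datasets at a small accuracy cost. A timing sketch under that assumption (the synthetic arrays are illustrative; float32 matches the fptype='float' setting above):

from timeit import default_timer as timer
import numpy as np

X = np.random.rand(50000, 13).astype(np.float32)
y = np.random.rand(50000, 1).astype(np.float32)

t0 = timer()
compute(X, y, X[:100])
print("hist training + prediction:", timer() - t0, "s")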
def _daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.double, np.single]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.asarray(y)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples,), for example using ravel().",
                      DataConversionWarning, stacklevel=2)

    y = check_array(y, ensure_2d=False, dtype=X.dtype)
    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    self.n_features_ = X.shape[1]
    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1],
                                                 is_classification=False)

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap)
    )

    self._cached_estimators_ = None
    dfr_trainingResult = dfr_algorithm.compute(X, y)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def _daal_fit_regressor(self, X, y, sample_weight=None):
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    # limitation on the number of streams for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1],
                                                 is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='hist' if daal_check_version((2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap
        if self.bootstrap is True else 1.,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
            ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
            ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    self._cached_estimators_ = None
    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    # if self.oob_score:
    #     self.estimators_ = self._estimators_
    #     self._set_oob_score(X, y)

    return self
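The mt2203 engine family provides a limited number of independent random streams (6024, per the oneDAL docs linked above), which is why this version falls back to mt19937 for very large forests. A standalone sketch of that guard (the function name is illustrative):

import daal4py

MAX_MT2203_STREAMS = 6024  # documented limit of the mt2203 engine family

def make_engine(n_estimators, seed, fptype):
    # Prefer mt2203 (independent stream per tree) when the forest fits
    # within the stream limit; otherwise fall back to mt19937.
    if n_estimators <= MAX_MT2203_STREAMS:
        return daal4py.engines_mt2203(seed=seed, fptype=fptype)
    return daal4py.engines_mt19937(seed=seed, fptype=fptype)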
def daal_fit(self, X, y):
    self._check_daal_supported_parameters()
    _supported_dtypes_ = [np.double, np.single]
    X = check_array(X, dtype=_supported_dtypes_)
    y = np.atleast_1d(y)

    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was"
            " expected. Please change the shape of y to "
            "(n_samples,), for example using ravel().",
            DataConversionWarning, stacklevel=2)

    y = check_array(y, ensure_2d=False, dtype=X.dtype)
    check_consistent_length(X, y)

    if y.ndim == 1:
        # reshape is necessary to preserve data contiguity,
        # which [:, np.newaxis] does not
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]
    self.n_features_ = X.shape[1]
    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features, X.shape[1],
                                                 is_classification=False)

    # create algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=1,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=1,
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None
                                else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap))

    dfr_trainingResult = dfr_algorithm.compute(X, y)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # convert model to estimators
    est = DecisionTreeRegressor(
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_features=self.max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        min_impurity_decrease=self.min_impurity_decrease,
        min_impurity_split=self.min_impurity_split,
        random_state=None)

    # we need to set est.tree_ field with Trees constructed from the
    # Intel(R) DAAL solution
    estimators_ = []
    for i in range(self.n_estimators):
        est_i = clone(est)
        est_i.n_features_ = self.n_features_
        est_i.n_outputs_ = self.n_outputs_

        tree_i_state_class = daal4py.getTreeState(model, i)
        tree_i_state_dict = {
            'max_depth': tree_i_state_class.max_depth,
            'node_count': tree_i_state_class.node_count,
            'nodes': tree_i_state_class.node_ar,
            'values': tree_i_state_class.value_ar
        }

        est_i.tree_ = Tree(self.n_features_,
                           np.array([1], dtype=np.intp),
                           self.n_outputs_)
        est_i.tree_.__setstate__(tree_i_state_dict)
        estimators_.append(est_i)

    self.estimators_ = estimators_

    # compute oob_score_
    if self.oob_score:
        self._set_oob_score(X, y)

    return self
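Because daal_fit rebuilds scikit-learn Tree objects from the oneDAL model via daal4py.getTreeState, the fitted forest can be inspected with ordinary scikit-learn tree attributes. A sketch, assuming rf is a hypothetical estimator fitted through this method:

# Hypothetical fitted estimator `rf` whose daal_fit populated estimators_
first = rf.estimators_[0]
print("max depth:", first.tree_.max_depth)
print("node count:", first.tree_.node_count)
print("root split feature:", first.tree_.feature[0])
print("root split threshold:", first.tree_.threshold[0])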
from timeit import default_timer as timer
# import xgboost as xgb
from sklearn.metrics import mean_squared_error
import daal4py as d4p
import numpy as np
import pandas as pd

import common

d4p.daalinit()

NUM_LOOPS = 100

print("Computing for Random Forest")

MODEL = d4p.decision_forest_regression_training(nTrees=100)
train_result = MODEL.compute(common.X_df, common.y_df)


def run_inference(num_observations: int = 1000):
    """Benchmark daal4py decision forest prediction for the specified number of observations"""
    # Load data
    test_df = common.get_test_data_df(X=common.X_df, size=num_observations)
    num_rows = len(test_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()
        predict_algo = d4p.decision_forest_regression_prediction(fptype='float')
        predict_result = predict_algo.compute(test_df, train_result.model)
        end_time = timer()

        # Timing bookkeeping mirrors the training benchmark above
        total_time = end_time - start_time
        run_times.append(total_time * 10e3)

        inference_time = total_time * (10e6) / num_rows
        inference_times.append(inference_time)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
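A sketch of driving this benchmark across a few sizes and releasing daal4py's resources afterwards (the size list is illustrative; d4p.daalfini() pairs with the daalinit() call above):

if __name__ == "__main__":
    for n in (1000, 10000, 100000):
        run_inference(n)
    d4p.daalfini()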