import numpy as np
import scipy.sparse as sp

import daal4py


def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, verbose,
                                        random_state):

    def is_string(s, target_str):
        return isinstance(s, str) and s == target_str

    is_sparse = sp.isspmatrix(X)

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense"
        daal_engine = daal4py.engines_mt19937(
            fptype=X_fptype, method="defaultDense", seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, nTrials=_n_local_trials,
            method=plus_plus_method, engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        random_method = "randomCSR" if is_sparse else "randomDense"
        daal_engine = daal4py.engines_mt19937(
            seed=_seed, fptype=X_fptype, method="defaultDense")
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method=random_method,
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        # _validate_center_shape is a module-level helper that checks the
        # user-provided centers against (nClusters, n_features)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        default_method = "lloydCSR" if is_sparse else "defaultDense"
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method=default_method)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError(
            f"init should be either 'k-means++', 'random', an ndarray or a "
            f"callable, got '{cluster_centers_0}' instead.")
    if verbose:
        print("Initialization complete")
    return deterministic, centroids_
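# A minimal usage sketch for the initializer above, with synthetic data.
# Hedged assumptions: getFPType is importable from daal4py.sklearn._utils
# (as in current daal4py releases), and random_state must expose randint,
# so a NumPy RandomState is passed:
#
# import numpy as np
# from daal4py.sklearn._utils import getFPType
#
# X = np.random.rand(100, 4)
# deterministic, centroids = _daal4py_compute_starting_centroids(
#     X, getFPType(X), nClusters=3, cluster_centers_0='k-means++',
#     verbose=False, random_state=np.random.RandomState(42))
# assert centroids.shape == (3, 4)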
# Older variant of the same initializer (before sparse-input support was
# added); string_types here is str on Python 3, historically imported from
# six / sklearn externals.
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, random_state):

    def is_string(s, target_str):
        return isinstance(s, string_types) and s == target_str

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(
            fptype=X_fptype, method='defaultDense', seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, nTrials=_n_local_trials,
            method='plusPlusDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(
            seed=_seed, fptype=X_fptype, method='defaultDense')
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method='randomDense',
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method='defaultDense')
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError(
            "Cluster centers should either be 'k-means++', 'random', "
            "'deterministic' or an array")
    return deterministic, centroids_
import daal4py as d4p


def compute(train_data, train_labels, predict_data, method='defaultDense'):
    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        fptype='float',
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError',
        method=method)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    train_result = train_algo.compute(train_data, train_labels)

    # Now predict using the model from the training above
    predict_algo = d4p.decision_forest_classification_prediction(
        nClasses=5,
        fptype='float',
        resultsToEvaluate="computeClassLabels|computeClassProbabilities",
        votingMethod="unweighted")
    predict_result = predict_algo.compute(predict_data, train_result.model)

    return train_result, predict_result
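# Hedged usage sketch for compute() above, on synthetic data; the shapes,
# dtypes and class count are illustrative assumptions, not requirements of
# the example itself:
import numpy as np

X = np.random.rand(100, 3).astype(np.float32)
y = np.random.randint(0, 5, size=(100, 1)).astype(np.float32)
train_res, pred_res = compute(X, y, X)
print(pred_res.prediction[:5])     # predicted class labels, shape (n, 1)
print(pred_res.probabilities[:5])  # per-class probabilities, shape (n, 5)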
import daal4py as d4p
import numpy as np

# read_csv below is the pandas-based helper shared by the daal4py examples:
# it reads the given columns of a headerless CSV into an array of dtype t.


def main():
    # Input data files
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError')

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3), t=np.float32)
    labels = read_csv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)
    # Training result provides (depending on parameters) model, outOfBagError,
    # outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_classification_prediction(5)
    # Read test data (with same #features)
    pdata = read_csv(testfile, range(3), t=np.float32)
    plabels = read_csv(testfile, range(3, 4), t=np.float32)
    # Now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Prediction result provides prediction
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, plabels)
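# Hedged driver sketch for main() above. It assumes read_csv returns a
# pandas DataFrame (hence .values) and that outOfBagError comes back as a
# 1x1 array, both as in the daal4py examples:
if __name__ == "__main__":
    (train_result, predict_result, plabels) = main()
    print("OOB error:", train_result.outOfBagError[0][0])
    accuracy = np.mean(predict_result.prediction == plabels.values)
    print("Test accuracy:", accuracy)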
import daal4py as d4p
import numpy as np


def main(readcsv=None, method='defaultDense'):
    # Create algorithm
    algorithm = d4p.distributions_normal(engine=d4p.engines_mt19937(seed=777))

    # Create array and fill with normal distribution
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    # compute() fills `data` in place and returns the same numbers
    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(data, [[
        -0.74104167, -0.13616829, -0.13679562, 2.40385531, -0.33556821,
        0.19041699, -0.61331181, 0.95958821, -0.42301092, 0.09460208
    ]])

    return data
import daal4py as d4p
import numpy as np


def main(readcsv=None, method='defaultDense'):
    # Create algorithm (success probability p=0.5)
    algorithm = d4p.distributions_bernoulli(
        0.5, engine=d4p.engines_mt19937(seed=777))

    # Create array and fill with Bernoulli distribution
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(
        data, [[1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]])

    return data
import daal4py as d4p
import numpy as np


def main(readcsv=None, method='defaultDense'):
    # Create algorithm
    algorithm = d4p.distributions_uniform(engine=d4p.engines_mt19937(seed=777))

    # Create array and fill with uniform distribution
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(data, [[
        0.22933409, 0.44584412, 0.44559617, 0.9918884, 0.36859825,
        0.57550881, 0.26983509, 0.83136875, 0.33614365, 0.53768455
    ]])

    return data
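# The three distribution examples above share one pattern: compute() writes
# the generated numbers into the preallocated array passed to it and also
# exposes them as res.randomNumbers, which is what the asserts verify.
# A minimal standalone sketch of that pattern (seed and shape arbitrary):
import daal4py as d4p
import numpy as np

engine = d4p.engines_mt19937(seed=777)
buf = np.zeros((2, 5))
res = d4p.distributions_uniform(engine=engine).compute(buf)
assert np.allclose(buf, res.randomNumbers)  # buffer was filled in place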
def _daal_fit_regressor(self, X, y, sample_weight=None):
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    # The mt2203 engine supports at most 6024 independent streams;
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    # Create the training algorithm
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=X_fptype,
        method='hist' if daal_check_version((2021, 'P', 200))
        else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap
        if self.bootstrap is True else 1.,
        featuresPerNode=int(_featuresPerNode),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(
            self.min_samples_leaf
            if isinstance(self.min_samples_leaf, numbers.Integral)
            else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(
            self.min_samples_split
            if isinstance(self.min_samples_split, numbers.Integral)
            else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    self._cached_estimators_ = None
    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # Get the resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    # if self.oob_score:
    #     self.estimators_ = self._estimators_
    #     self._set_oob_score(X, y)

    return self
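# For context, _daal_fit_regressor is what a daal4py-accelerated
# RandomForestRegressor.fit delegates to; from user code the entry point is
# the estimator itself. A hedged sketch, assuming the patched estimator is
# importable from daal4py.sklearn.ensemble (data and parameters arbitrary):
#
# import numpy as np
# from daal4py.sklearn.ensemble import RandomForestRegressor
#
# X = np.random.rand(200, 5)
# y = np.random.rand(200)
# est = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
# print(est.predict(X[:3]))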
def _daal_fit_classifier(self, X, y, sample_weight=None):
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contains information about one class.")

    # Create the training algorithm
    X_fptype = getFPType(X)
    # The mt2203 engine supports at most 6024 independent streams;
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    dfc_algorithm = daal4py.decision_forest_classification_training(
        nClasses=int(n_classes_),
        fptype=X_fptype,
        method='hist',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap_
        if self.bootstrap is True else 1.,
        featuresPerNode=int(features_per_node_),
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(
            self.min_samples_leaf
            if isinstance(self.min_samples_leaf, numbers.Integral)
            else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute=(
            "computeOutOfBagErrorAccuracy|computeOutOfBagErrorDecisionFunction"
            if self.oob_score else ""),
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(
            self.min_samples_split
            if isinstance(self.min_samples_split, numbers.Integral)
            else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None
        else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    self._cached_estimators_ = None

    # Compute the forest
    dfc_trainingResult = dfc_algorithm.compute(X, y, sample_weight)

    # Get the resulting model
    model = dfc_trainingResult.model
    self.daal_model_ = model

    if self.oob_score:
        self.oob_score_ = dfc_trainingResult.outOfBagErrorAccuracy[0][0]
        self.oob_decision_function_ = \
            dfc_trainingResult.outOfBagErrorDecisionFunction
        if self.oob_decision_function_.shape[-1] == 1:
            self.oob_decision_function_ = \
                self.oob_decision_function_.squeeze(axis=-1)

    return self
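# Hedged sketch of the classifier counterpart, again assuming daal4py's
# patched estimator is importable from daal4py.sklearn.ensemble; with
# oob_score=True the attributes set at the end of the method above
# (oob_score_, oob_decision_function_) become available:
#
# import numpy as np
# from daal4py.sklearn.ensemble import RandomForestClassifier
#
# X = np.random.rand(200, 5)
# y = np.random.randint(0, 3, size=200)
# clf = RandomForestClassifier(n_estimators=50, oob_score=True,
#                              random_state=0).fit(X, y)
# print(clf.oob_score_)  # populated from outOfBagErrorAccuracy above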