def main(readcsv=read_csv, method='defaultDense'):
    """Train a gradient boosted trees classifier and check it on the test set.

    The training and prediction algorithms are constructed with extra
    options (variable importance, class probabilities) when the linked
    DAAL version is 2020.0 or newer; older versions get the reduced
    argument set.

    Parameters
    ----------
    readcsv : callable
        Reader invoked as ``readcsv(path, columns, t=np.float32)``.
    method : str
        Not used by this function.

    Returns
    -------
    tuple
        ``(train_result, predict_result, plabels)`` where ``plabels`` are
        the labels read from the test file.

    Raises
    ------
    AssertionError
        If the fraction of non-zero differences between the predictions
        and the test labels is not below 0.022.
    """
    # The algorithm constructors changed their interface with 2020.0, so
    # work out which version is linked before configuring anything.
    from daal4py import __daal_link_version__ as link_version
    daal_version = (int(link_version[0:4]), int(link_version[4:8]))
    has_new_interface = daal_version >= (2020, 0)

    n_features = 3
    n_classes = 5
    max_iterations = 200
    min_leaf_observations = 8

    # Input data files
    train_path = "./data/batch/df_classification_train.csv"
    test_path = "./data/batch/df_classification_test.csv"

    # First three columns hold the features, the fourth holds the label.
    feature_columns = range(3)
    label_columns = range(3, 4)

    # Configure the training object (5 classes)
    training_options = {
        'nClasses': n_classes,
        'maxIterations': max_iterations,
        'minObservationsInLeafNode': min_leaf_observations,
        'featuresPerNode': n_features,
    }
    if has_new_interface:
        training_options['varImportance'] = 'weight|totalCover|cover|totalGain|gain'
    trainer = d4p.gbt_classification_training(**training_options)

    # Read the training set and fit the model
    train_data = readcsv(train_path, feature_columns, t=np.float32)
    train_labels = readcsv(train_path, label_columns, t=np.float32)
    training_result = trainer.compute(train_data, train_labels)

    # Configure the prediction object
    prediction_options = {'nClasses': n_classes}
    if has_new_interface:
        prediction_options['resultsToEvaluate'] = \
            "computeClassLabels|computeClassProbabilities"
    predictor = d4p.gbt_classification_prediction(**prediction_options)

    # Predict on the test data (same number of features) with the trained model
    test_data = readcsv(test_path, feature_columns, t=np.float32)
    prediction_result = predictor.compute(test_data, training_result.model)

    # Compare the predicted labels with the ground truth from the test file
    expected_labels = readcsv(test_path, label_columns, t=np.float32)
    mismatch_fraction = (
        np.count_nonzero(prediction_result.prediction - expected_labels)
        / test_data.shape[0]
    )
    assert mismatch_fraction < 0.022

    return (training_result, prediction_result, expected_labels)
def fit(self, X, y):
    """Train the gradient boosted trees classifier on ``X`` and ``y``.

    Sets ``classes_``, ``n_outputs_``, ``n_classes_`` and ``n_features_``
    on the estimator, and stores the trained model in ``daal_model_``.
    When ``y`` contains a single class nothing is trained and
    ``daal_model_`` is not assigned.

    Parameters
    ----------
    X : array-like
        Training samples; validated by ``check_X_y``.
    y : array-like
        Class labels; validated by ``check_X_y`` and
        ``check_classification_targets``.

    Returns
    -------
    self
        The estimator itself.
    """
    # Validate the hyper-parameters, then the shape/dtype of the inputs
    self._check_params()
    X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double])
    check_classification_targets(y)

    # Map the original labels onto encoded values
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    self.classes_ = encoder.classes_

    # The training algorithm is given the labels as a single-column 2d array
    targets = encoder.transform(y).reshape((-1, 1))

    self.n_outputs_ = targets.shape[1]
    self.n_classes_ = len(self.classes_)
    self.n_features_ = X.shape[1]

    # Trivial case: the classifier can't train when only one class is present
    if self.n_classes_ == 1:
        return self

    # Draw the seed for the engine from the configured random state
    random_state = check_random_state(self.random_state)
    seed = random_state.randint(0, np.iinfo('i').max)

    training_options = dict(
        fptype=getFPType(X),
        nClasses=self.n_classes_,
        splitMethod=self.split_method,
        maxIterations=self.max_iterations,
        maxTreeDepth=self.max_tree_depth,
        shrinkage=self.shrinkage,
        minSplitLoss=self.min_split_loss,
        lambda_=self.reg_lambda,
        observationsPerTreeFraction=self.observations_per_tree_fraction,
        featuresPerNode=self.features_per_node,
        minObservationsInLeafNode=self.min_observations_in_leaf_node,
        memorySavingMode=self.memory_saving_mode,
        maxBins=self.max_bins,
        minBinSize=self.min_bin_size,
        engine=d4p.engines_mcg59(seed=seed),
    )

    # Fit and keep only the resulting model
    trainer = d4p.gbt_classification_training(**training_options)
    self.daal_model_ = trainer.compute(X, targets).model

    return self
def main():
    """Train a gradient boosted trees classifier and predict on the test set.

    Reads the training and test CSV files with ``read_csv``, trains with
    40 iterations, and predicts labels for the test data.

    Returns
    -------
    tuple
        ``(train_result, predict_result, plabels)`` where ``plabels`` are
        the labels read from the test file.

    Raises
    ------
    AssertionError
        If the prediction does not have shape ``(number of test rows, 1)``.
    """
    nFeatures = 3
    nClasses = 5
    maxIterations = 40
    minObservationsInLeafNode = 8

    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.gbt_classification_training(
        nClasses=nClasses,
        maxIterations=maxIterations,
        minObservationsInLeafNode=minObservationsInLeafNode,
        featuresPerNode=nFeatures)

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3))
    labels = read_csv(infile, range(3, 4))
    train_result = train_algo.compute(data, labels)

    # Now let's do some prediction. The class count comes from the same
    # variable used for training so the two can never disagree.
    predict_algo = d4p.gbt_classification_prediction(nClasses=nClasses)

    # read test data (with same #features)
    pdata = read_csv(testfile, range(3))
    plabels = read_csv(testfile, range(3, 4))

    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Prediction result provides one predicted label per test row
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, plabels)
def main(readcsv=read_csv, method='defaultDense'):
    """Train a gradient boosted trees classifier and check its test accuracy.

    Parameters
    ----------
    readcsv : callable
        Reader invoked as ``readcsv(path, columns, t=np.float32)``.
    method : str
        Not used by this function.

    Returns
    -------
    tuple
        ``(train_result, predict_result, plabels)`` where ``plabels`` are
        the labels read from the test file.

    Raises
    ------
    AssertionError
        If the fraction of non-zero differences between the predictions
        and the test labels is not below 0.022.
    """
    nFeatures = 3
    nClasses = 5
    maxIterations = 200
    minObservationsInLeafNode = 8

    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.gbt_classification_training(
        nClasses=nClasses,
        maxIterations=maxIterations,
        minObservationsInLeafNode=minObservationsInLeafNode,
        featuresPerNode=nFeatures)

    # Read data. Let's use 3 features per observation
    data = readcsv(infile, range(3), t=np.float32)
    labels = readcsv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)

    # Now let's do some prediction. The class count comes from the same
    # variable used for training so the two can never disagree.
    predict_algo = d4p.gbt_classification_prediction(nClasses=nClasses)

    # read test data (with same #features)
    pdata = readcsv(testfile, range(3), t=np.float32)

    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Compare the predicted labels against the ground truth from the test file
    plabels = readcsv(testfile, range(3, 4), t=np.float32)
    mismatches = np.count_nonzero(predict_result.prediction - plabels)
    assert mismatches / pdata.shape[0] < 0.022

    return (train_result, predict_result, plabels)