Example #1
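This example, like Examples #3 and #4 below, relies on a small preamble that is not shown in the snippet: the daal4py and NumPy imports and a CSV-reading helper. A minimal sketch of that preamble, assuming pandas handles the file I/O (the original sample files may define read_csv slightly differently):

import numpy as np
import pandas as pd

import daal4py as d4p


def read_csv(f, c, t=np.float64):
    # Read the selected columns of a header-less, comma-separated file
    # as a table of the requested floating-point type.
    return pd.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t)
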
def main(readcsv=read_csv, method='defaultDense'):
    nFeatures = 3
    nClasses = 5
    maxIterations = 200
    minObservationsInLeafNode = 8
    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    # Older daal4py versions use a different training interface (no varImportance parameter)
    from daal4py import __daal_link_version__ as dv
    daal_version = tuple(map(int, (dv[0:4], dv[4:8])))
    if daal_version < (2020, 0):
        train_algo = d4p.gbt_classification_training(
            nClasses=nClasses,
            maxIterations=maxIterations,
            minObservationsInLeafNode=minObservationsInLeafNode,
            featuresPerNode=nFeatures)
    else:
        train_algo = d4p.gbt_classification_training(
            nClasses=nClasses,
            maxIterations=maxIterations,
            minObservationsInLeafNode=minObservationsInLeafNode,
            featuresPerNode=nFeatures,
            varImportance='weight|totalCover|cover|totalGain|gain')

    # Read data. Let's use 3 features per observation
    data = readcsv(infile, range(3), t=np.float32)
    labels = readcsv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)

    # Now let's do some prediction
    # Older daal4py versions do not support the resultsToEvaluate parameter
    if daal_version < (2020, 0):
        predict_algo = d4p.gbt_classification_prediction(nClasses=nClasses)
    else:
        predict_algo = d4p.gbt_classification_prediction(
            nClasses=nClasses,
            resultsToEvaluate="computeClassLabels|computeClassProbabilities")
    # read test data (with same #features)
    pdata = readcsv(testfile, range(3), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Verify the predictions against the known test labels (misclassification rate below 2.2%)
    plabels = readcsv(testfile, range(3, 4), t=np.float32)
    assert np.count_nonzero(predict_result.prediction -
                            plabels) / pdata.shape[0] < 0.022

    return (train_result, predict_result, plabels)
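
A driver for this example can follow the usual daal4py sample pattern. The sketch below is illustrative: the result attributes prediction and probabilities follow daal4py's standard result naming, and the probabilities are only present with the newer interface that requests computeClassProbabilities:

if __name__ == "__main__":
    (train_result, predict_result, plabels) = main()
    # Show the first few predicted labels next to the ground truth
    print("Predicted labels (first 10 rows):\n", predict_result.prediction[0:10])
    print("Ground truth (first 10 rows):\n", plabels[0:10])
    # Class probabilities were requested via computeClassProbabilities,
    # so recent daal4py versions include them in the prediction result
    print("Prediction probabilities (first 10 rows):\n",
          predict_result.probabilities[0:10])
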
Example #2
    def fit(self, X, y):
        # Check the algorithm parameters
        self._check_params()

        # Check that X and y have correct shape
        X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double])

        check_classification_targets(y)

        # Encode labels
        le = preprocessing.LabelEncoder()
        le.fit(y)
        self.classes_ = le.classes_
        y_ = le.transform(y)

        # Convert to 2d array
        y_ = y_.reshape((-1, 1))

        self.n_outputs_ = y_.shape[1]

        self.n_classes_ = len(self.classes_)

        self.n_features_ = X.shape[1]

        # Trivial case: training is skipped when only one class is present
        if self.n_classes_ == 1:
            return self

        # Get random seed
        rs_ = check_random_state(self.random_state)
        seed_ = rs_.randint(0, np.iinfo('i').max)

        # Determine the floating-point type of the input data
        fptype = getFPType(X)

        # Fit the model
        train_algo = d4p.gbt_classification_training(
            fptype=fptype,
            nClasses=self.n_classes_,
            splitMethod=self.split_method,
            maxIterations=self.max_iterations,
            maxTreeDepth=self.max_tree_depth,
            shrinkage=self.shrinkage,
            minSplitLoss=self.min_split_loss,
            lambda_=self.reg_lambda,
            observationsPerTreeFraction=self.observations_per_tree_fraction,
            featuresPerNode=self.features_per_node,
            minObservationsInLeafNode=self.min_observations_in_leaf_node,
            memorySavingMode=self.memory_saving_mode,
            maxBins=self.max_bins,
            minBinSize=self.min_bin_size,
            engine=d4p.engines_mcg59(seed=seed_))
        train_result = train_algo.compute(X, y_)

        # Store the model
        self.daal_model_ = train_result.model

        # Return the classifier
        return self
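
The fit method above trains the model and stores it as self.daal_model_; it assumes the surrounding class imports numpy as np, daal4py as d4p, the scikit-learn utilities used above (check_X_y, check_classification_targets, preprocessing, check_random_state), and daal4py's getFPType. For completeness, here is a minimal sketch of a matching predict method. It is illustrative rather than part of the original example; check_array is an additional assumed import from sklearn.utils:

    def predict(self, X):
        # Illustrative companion to the fit method above
        # Validate input the same way fit does
        X = check_array(X, dtype=[np.single, np.double])

        # Trivial case: only one class was seen during fit
        if self.n_classes_ == 1:
            return np.full(X.shape[0], self.classes_[0])

        # Run daal4py inference with the stored model
        fptype = getFPType(X)
        predict_algo = d4p.gbt_classification_prediction(
            fptype=fptype,
            nClasses=self.n_classes_)
        predict_result = predict_algo.compute(X, self.daal_model_)

        # Map the numeric class indices back to the original labels
        return np.take(self.classes_,
                       predict_result.prediction.ravel().astype(np.int64))
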
Example #3
def main():
    nFeatures = 3
    nClasses = 5
    maxIterations = 40
    minObservationsInLeafNode = 8
    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.gbt_classification_training(
        nClasses=nClasses,
        maxIterations=maxIterations,
        minObservationsInLeafNode=minObservationsInLeafNode,
        featuresPerNode=nFeatures)

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3))
    labels = read_csv(infile, range(3, 4))
    train_result = train_algo.compute(data, labels)

    # Now let's do some prediction
    predict_algo = d4p.gbt_classification_prediction(nClasses=nClasses)
    # read test data (with same #features)
    pdata = read_csv(testfile, range(3))
    plabels = read_csv(testfile, range(3, 4))
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # The prediction result provides one predicted label per observation
    assert (predict_result.prediction.shape == (pdata.shape[0], 1))

    return (train_result, predict_result, plabels)
Example #4
def main(readcsv=read_csv, method='defaultDense'):
    nFeatures = 3
    nClasses = 5
    maxIterations = 200
    minObservationsInLeafNode = 8
    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.gbt_classification_training(
        nClasses=nClasses,
        maxIterations=maxIterations,
        minObservationsInLeafNode=minObservationsInLeafNode,
        featuresPerNode=nFeatures)

    # Read data. Let's use 3 features per observation
    data = readcsv(infile, range(3), t=np.float32)
    labels = readcsv(infile, range(3, 4), t=np.float32)
    train_result = train_algo.compute(data, labels)

    # Now let's do some prediction
    predict_algo = d4p.gbt_classification_prediction(nClasses=nClasses)
    # read test data (with same #features)
    pdata = readcsv(testfile, range(3), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Verify the predictions against the known test labels (misclassification rate below 2.2%)
    plabels = readcsv(testfile, range(3, 4), t=np.float32)
    assert np.count_nonzero(predict_result.prediction -
                            plabels) / pdata.shape[0] < 0.022

    return (train_result, predict_result, plabels)