예제 #1
0
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, verbose,
                                        random_state):
    """Build the starting centroids for a daal4py k-means computation.

    Parameters
    ----------
    X
        Data handed to ``kmeans_init.compute``. ``sp.isspmatrix(X)`` decides
        whether the CSR or the dense daal4py method names are used.
    X_fptype
        Forwarded as ``fptype`` to the daal4py engine and init algorithm.
    nClusters
        Number of centroids requested.
    cluster_centers_0
        One of the strings ``'k-means++'``, ``'random'``, ``'deterministic'``,
        an object exposing ``__array__``, or a callable invoked as
        ``cluster_centers_0(X, nClusters, random_state)``.
    verbose
        When truthy, ``"Initialization complete"`` is printed before
        returning.
    random_state
        Object whose ``randint`` provides the engine seed for the
        ``'k-means++'`` and ``'random'`` strategies; it is also passed to a
        callable ``cluster_centers_0``.

    Returns
    -------
    tuple
        ``(deterministic, centroids_)``. ``deterministic`` is ``True`` for
        array-like input and for ``'deterministic'``, ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``cluster_centers_0`` matches none of the supported forms.
    """
    sparse_input = sp.isspmatrix(X)
    # Only genuine strings are compared against the strategy names.
    init_name = cluster_centers_0 if isinstance(cluster_centers_0, str) else None

    def _seeded_engine():
        # Draw the seed from the caller's random_state for the mt19937 engine.
        seed = random_state.randint(np.iinfo('i').max)
        return daal4py.engines_mt19937(fptype=X_fptype,
                                       method="defaultDense",
                                       seed=seed)

    def _checked_centers(candidate):
        # Convert to a contiguous array of X's dtype, then validate its shape.
        centers = np.ascontiguousarray(candidate, dtype=X.dtype)
        _validate_center_shape(X, nClusters, centers)
        return centers

    deterministic = False
    if init_name == 'k-means++':
        engine = _seeded_engine()
        init_algo = daal4py.kmeans_init(
            nClusters,
            fptype=X_fptype,
            nTrials=2 + int(np.log(nClusters)),
            method="plusPlusCSR" if sparse_input else "plusPlusDense",
            engine=engine)
        centroids_ = init_algo.compute(X).centroids
    elif init_name == 'random':
        engine = _seeded_engine()
        init_algo = daal4py.kmeans_init(
            nClusters,
            fptype=X_fptype,
            method="randomCSR" if sparse_input else "randomDense",
            engine=engine)
        centroids_ = init_algo.compute(X).centroids
    elif hasattr(cluster_centers_0, '__array__'):
        # User-supplied centres: nothing random is involved.
        deterministic = True
        centroids_ = _checked_centers(cluster_centers_0)
    elif callable(cluster_centers_0):
        centroids_ = _checked_centers(
            cluster_centers_0(X, nClusters, random_state))
    elif init_name == 'deterministic':
        deterministic = True
        init_algo = daal4py.kmeans_init(
            nClusters,
            fptype=X_fptype,
            method="lloydCSR" if sparse_input else "defaultDense")
        centroids_ = init_algo.compute(X).centroids
    else:
        raise ValueError(
            f"init should be either 'k-means++', 'random', a ndarray or a "
            f"callable, got '{cluster_centers_0}' instead.")

    if verbose:
        print("Initialization complete")
    return deterministic, centroids_
예제 #2
0
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters,
                                        cluster_centers_0, random_state):
    """Build the starting centroids for a dense daal4py k-means computation.

    Parameters
    ----------
    X
        Data handed to ``kmeans_init.compute``.
    X_fptype
        Forwarded as ``fptype`` to the daal4py engine and init algorithm.
    nClusters
        Number of centroids requested.
    cluster_centers_0
        One of the strings ``'k-means++'``, ``'random'``, ``'deterministic'``,
        an object exposing ``__array__``, or a callable invoked as
        ``cluster_centers_0(X, nClusters, random_state)``.
    random_state
        Object whose ``randint`` provides the engine seed for the
        ``'k-means++'`` and ``'random'`` strategies; it is also passed to a
        callable ``cluster_centers_0``.

    Returns
    -------
    tuple
        ``(deterministic, centroids)``. ``deterministic`` is ``True`` for
        array-like input and for ``'deterministic'``, ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``cluster_centers_0`` matches none of the supported forms.
    """
    init = cluster_centers_0

    def _named(label):
        # True only for string inputs equal to the given strategy name.
        return isinstance(init, string_types) and init == label

    def _new_engine():
        # mt19937 engine seeded from the caller's random_state.
        seed = random_state.randint(np.iinfo('i').max)
        return daal4py.engines_mt19937(fptype=X_fptype,
                                       method='defaultDense',
                                       seed=seed)

    def _run_init(**options):
        # Run daal4py's k-means initialization and return its centroids.
        algo = daal4py.kmeans_init(nClusters, fptype=X_fptype, **options)
        return algo.compute(X).centroids

    def _as_centers(values):
        # Convert to a contiguous array of X's dtype, then validate its shape.
        centers = np.ascontiguousarray(values, dtype=X.dtype)
        _validate_center_shape(X, nClusters, centers)
        return centers

    if _named('k-means++'):
        engine = _new_engine()
        n_trials = 2 + int(np.log(nClusters))
        return False, _run_init(nTrials=n_trials,
                                method='plusPlusDense',
                                engine=engine)
    if _named('random'):
        return False, _run_init(method='randomDense', engine=_new_engine())
    if hasattr(init, '__array__'):
        return True, _as_centers(init)
    if callable(init):
        return False, _as_centers(init(X, nClusters, random_state))
    if _named('deterministic'):
        return True, _run_init(method='defaultDense')

    raise ValueError(
        "Cluster centers should either be 'k-means++', 'random', 'deterministic' or an array"
    )
def compute(train_data, train_labels, predict_data, method='defaultDense'):
    """Train a 5-class decision forest and predict with the trained model.

    Parameters
    ----------
    train_data, train_labels
        Inputs passed to the training algorithm's ``compute``.
    predict_data
        Input passed to the prediction algorithm's ``compute`` together with
        the trained model.
    method
        Forwarded as ``method`` to the training algorithm.

    Returns
    -------
    tuple
        ``(train_result, predict_result)`` as returned by the two daal4py
        ``compute`` calls.
    """
    # Training configuration; the engine seed is fixed at 777.
    training_options = dict(
        fptype='float',
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError',
        method=method,
    )
    trainer = d4p.decision_forest_classification_training(5, **training_options)
    train_result = trainer.compute(train_data, train_labels)

    # Prediction is asked for both class labels and class probabilities.
    predictor = d4p.decision_forest_classification_prediction(
        nClasses=5,
        fptype='float',
        resultsToEvaluate="computeClassLabels|computeClassProbabilities",
        votingMethod="unweighted",
    )
    predict_result = predictor.compute(predict_data, train_result.model)

    return train_result, predict_result
def main():
    """Train a decision forest on CSV data and predict on a test CSV file.

    Reads the first three columns of each file as features and the fourth as
    labels, trains a 5-class decision forest, and predicts on the test
    features.

    Returns
    -------
    tuple
        ``(train_result, predict_result, plabels)`` where ``plabels`` are the
        labels read from the test file.
    """
    train_path = "./data/batch/df_classification_train.csv"
    test_path = "./data/batch/df_classification_test.csv"

    # Training algorithm for 5 classes; the engine seed is fixed at 777.
    trainer = d4p.decision_forest_classification_training(
        5,
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError')

    # Three feature columns per observation, labels in the fourth column.
    train_features = read_csv(train_path, range(3), t=np.float32)
    train_labels = read_csv(train_path, range(3, 4), t=np.float32)
    # Depending on the parameters, the training result provides the model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance.
    train_result = trainer.compute(train_features, train_labels)

    predictor = d4p.decision_forest_classification_prediction(5)
    # The test data uses the same number of feature columns.
    pdata = read_csv(test_path, range(3), t=np.float32)
    plabels = read_csv(test_path, range(3, 4), t=np.float32)
    predict_result = predictor.compute(pdata, train_result.model)

    # One prediction per test observation is expected.
    expected_shape = (pdata.shape[0], 1)
    assert predict_result.prediction.shape == expected_shape

    return (train_result, predict_result, plabels)
예제 #5
0
def main(readcsv=None, method='defaultDense'):
    """Run daal4py's normal distribution on a 1x10 array with a fixed seed.

    ``readcsv`` and ``method`` are accepted but not used by this example.

    Returns
    -------
    numpy.ndarray
        The ``data`` array, after it has been checked against both
        ``res.randomNumbers`` and the hard-coded reference values.
    """
    engine = d4p.engines_mt19937(seed=777)
    algorithm = d4p.distributions_normal(engine=engine)

    # Array passed to the distribution's compute call.
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    # Reference values for seed 777.
    expected = [[
        -0.74104167, -0.13616829, -0.13679562, 2.40385531, -0.33556821,
        0.19041699, -0.61331181, 0.95958821, -0.42301092, 0.09460208,
    ]]
    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(data, expected)

    return data
예제 #6
0
def main(readcsv=None, method='defaultDense'):
    """Run daal4py's Bernoulli distribution (p=0.5) on a 1x10 array.

    ``readcsv`` and ``method`` are accepted but not used by this example.

    Returns
    -------
    numpy.ndarray
        The ``data`` array, after it has been checked against both
        ``res.randomNumbers`` and the hard-coded reference values.
    """
    engine = d4p.engines_mt19937(seed=777)
    algorithm = d4p.distributions_bernoulli(0.5, engine=engine)

    # Array passed to the distribution's compute call.
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    # Reference values for seed 777.
    expected = [[1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]]
    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(data, expected)

    return data
def main(readcsv=None, method='defaultDense'):
    """Run daal4py's uniform distribution on a 1x10 array with a fixed seed.

    ``readcsv`` and ``method`` are accepted but not used by this example.

    Returns
    -------
    numpy.ndarray
        The ``data`` array, after it has been checked against both
        ``res.randomNumbers`` and the hard-coded reference values.
    """
    engine = d4p.engines_mt19937(seed=777)
    algorithm = d4p.distributions_uniform(engine=engine)

    # Array passed to the uniform distribution's compute call.
    data = np.zeros((1, 10))
    res = algorithm.compute(data)

    # Reference values for seed 777.
    expected = [[
        0.22933409, 0.44584412, 0.44559617, 0.9918884, 0.36859825,
        0.57550881, 0.26983509, 0.83136875, 0.33614365, 0.53768455,
    ]]
    assert np.allclose(data, res.randomNumbers)
    assert np.allclose(data, expected)

    return data
예제 #8
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Train a daal4py decision forest regression model and store it on self.

    Parameters
    ----------
    X, y
        Training inputs passed to the daal4py algorithm's ``compute``.
    sample_weight
        Optional weights; when given they are wrapped in a one-element list
        before being passed to ``compute``.

    Returns
    -------
    self
        With ``n_features_in_``, ``daal_model_`` and ``_cached_estimators_``
        set (and ``n_features_`` when ``sklearn_check_version('1.0')`` is
        falsy).

    Raises
    ------
    ValueError
        If ``oob_score`` is requested while ``bootstrap`` is disabled.
    """
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    engine_factory = (daal4py.engines_mt2203
                      if self.n_estimators <= max_stream_count
                      else daal4py.engines_mt19937)
    daal_engine = engine_factory(seed=seed_, fptype=X_fptype)

    features_per_node = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if sample_weight is not None:
        sample_weight = [sample_weight]

    def _as_count(value):
        # Integral settings are used as-is; anything else is treated as a
        # fraction of the number of rows in X and rounded up.
        if isinstance(value, numbers.Integral):
            return value
        return int(ceil(value * X.shape[0]))

    # Keyword arguments are kept in a dict (insertion-ordered) so they are
    # evaluated in the listed order.
    # NOTE(review): getFPType(X) is evaluated a second time here; X_fptype
    # could presumably be reused - confirm getFPType has no side effects.
    training_params = {
        'fptype': getFPType(X),
        'method': ('hist' if daal_check_version((2021, 'P', 200))
                   else 'defaultDense'),
        'nTrees': int(self.n_estimators),
        'observationsPerTreeFraction': (n_samples_bootstrap
                                        if self.bootstrap is True else 1.),
        'featuresPerNode': int(features_per_node),
        'maxTreeDepth': int(0 if self.max_depth is None else self.max_depth),
        'minObservationsInLeafNode': _as_count(self.min_samples_leaf),
        'engine': daal_engine,
        'impurityThreshold': float(0.0 if self.min_impurity_split is None
                                   else self.min_impurity_split),
        'varImportance': "MDI",
        'resultsToCompute': "",
        'memorySavingMode': False,
        'bootstrap': bool(self.bootstrap),
        'minObservationsInSplitNode': _as_count(self.min_samples_split),
        'minWeightFractionInLeafNode': self.min_weight_fraction_leaf,
        'minImpurityDecreaseInSplitNode': self.min_impurity_decrease,
        'maxLeafNodes': (0 if self.max_leaf_nodes is None
                         else self.max_leaf_nodes),
        'maxBins': self.maxBins,
        'minBinSize': self.minBinSize,
    }
    dfr_algorithm = daal4py.decision_forest_regression_training(
        **training_params)

    self._cached_estimators_ = None

    training_result = dfr_algorithm.compute(X, y, sample_weight)

    # Keep the trained model on the estimator.
    self.daal_model_ = training_result.model

    # NOTE: no oob_score_ is computed here even when self.oob_score is set;
    # resultsToCompute is empty for the regressor.

    return self
예제 #9
0
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a daal4py decision forest classification model and store it.

    Parameters
    ----------
    X
        Training data passed to the daal4py algorithm's ``compute``.
    y
        Labels; checked with ``check_array`` and then processed by
        ``self._validate_y_class_weight``.
    sample_weight
        Optional weights. They are multiplied by the expanded class weights
        when those are present, and wrapped in a one-element list before
        being passed to ``compute``.

    Returns
    -------
    self
        With ``n_features_in_``, ``daal_model_`` and ``_cached_estimators_``
        set; ``oob_score_`` and ``oob_decision_function_`` are also set when
        ``self.oob_score`` is truthy.

    Raises
    ------
    ValueError
        If fewer than two classes are present, or if ``oob_score`` is
        requested while ``bootstrap`` is disabled.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    n_classes_ = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    # Fold the class weights into the per-sample weights.
    if expanded_class_weight is not None:
        sample_weight = (expanded_class_weight if sample_weight is None
                         else sample_weight * expanded_class_weight)
    if sample_weight is not None:
        sample_weight = [sample_weight]

    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    if n_classes_ < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    X_fptype = getFPType(X)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    engine_factory = (daal4py.engines_mt2203
                      if self.n_estimators <= max_stream_count
                      else daal4py.engines_mt19937)
    daal_engine = engine_factory(seed=seed_, fptype=X_fptype)

    features_per_node_ = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    n_samples_bootstrap_ = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples
    )

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    def _as_count(value):
        # Integral settings are used as-is; anything else is treated as a
        # fraction of the number of rows in X and rounded up.
        if isinstance(value, numbers.Integral):
            return value
        return int(ceil(value * X.shape[0]))

    # Keyword arguments are kept in a dict (insertion-ordered) so they are
    # evaluated in the listed order.
    training_params = {
        'nClasses': int(n_classes_),
        'fptype': X_fptype,
        'method': 'hist',
        'nTrees': int(self.n_estimators),
        'observationsPerTreeFraction': (n_samples_bootstrap_
                                        if self.bootstrap is True else 1.),
        'featuresPerNode': int(features_per_node_),
        'maxTreeDepth': int(0 if self.max_depth is None else self.max_depth),
        'minObservationsInLeafNode': _as_count(self.min_samples_leaf),
        'engine': daal_engine,
        'impurityThreshold': float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        'varImportance': "MDI",
        # Out-of-bag results are requested only when oob_score is enabled.
        'resultsToCompute': (
            "computeOutOfBagErrorAccuracy|computeOutOfBagErrorDecisionFunction"
            if self.oob_score
            else ""),
        'memorySavingMode': False,
        'bootstrap': bool(self.bootstrap),
        'minObservationsInSplitNode': _as_count(self.min_samples_split),
        'minWeightFractionInLeafNode': self.min_weight_fraction_leaf,
        'minImpurityDecreaseInSplitNode': self.min_impurity_decrease,
        'maxLeafNodes': (0 if self.max_leaf_nodes is None
                         else self.max_leaf_nodes),
        'maxBins': self.maxBins,
        'minBinSize': self.minBinSize,
    }
    dfc_algorithm = daal4py.decision_forest_classification_training(
        **training_params)

    self._cached_estimators_ = None
    training_result = dfc_algorithm.compute(X, y, sample_weight)

    # Keep the trained model on the estimator.
    self.daal_model_ = training_result.model

    if not self.oob_score:
        return self

    self.oob_score_ = training_result.outOfBagErrorAccuracy[0][0]
    oob_decision = training_result.outOfBagErrorDecisionFunction
    self.oob_decision_function_ = oob_decision
    if oob_decision.shape[-1] == 1:
        # Drop a trailing axis of length one.
        self.oob_decision_function_ = oob_decision.squeeze(axis=-1)

    return self