Exemplo n.º 1
0
def test_drf_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's RandomForestClassifier with its sklearn fallback.

    Loads ``./open_data/creditcard.csv``, using the last column as the label
    and all preceding columns as features, then fits both
    ``h2o4gpu.RandomForestClassifier`` and
    ``h2o4gpu.ensemble.RandomForestClassifierSklearn`` with the same seed and
    number of trees.

    The equality assertions run only when ``backend == "sklearn"``; for any
    other value the function just checks that both fits complete.

    :param backend: backend name forwarded to the h2o4gpu solver.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    n_cols = df.shape[1]
    X = np.array(df.iloc[:, :n_cols - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, n_cols - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestClassifier

    # Run h2o4gpu version of RandomForest Classifier
    drf = Solver(backend=backend, random_state=1234, oob_score=True,
                 n_estimators=10, n_gpus=n_gpus())
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run Sklearn version of RandomForest Classifier
    from h2o4gpu.ensemble import RandomForestClassifierSklearn
    drf_sk = RandomForestClassifierSklearn(random_state=1234, oob_score=True,
                                           max_depth=3, n_estimators=10)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        # Predictions and per-sample outputs must agree element-wise.
        assert (drf.predict(X) == drf_sk.predict(X)).all()
        assert (drf.predict_log_proba(X) == drf_sk.predict_log_proba(X)).all()
        assert (drf.predict_proba(X) == drf_sk.predict_proba(X)).all()
        # score() is a scalar, so a plain equality is enough.
        assert drf.score(X, y) == drf_sk.score(X, y)
        # Only the second element of the decision_path() result is compared.
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all()
        assert (drf.apply(X) == drf_sk.apply(X)).all()

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_classes_")
        print(drf.n_classes_)
        print(drf_sk.n_classes_)
        assert drf.n_classes_ == drf_sk.n_classes_

        # The label used to read "n_features" (copy-paste slip).
        print("classes_")
        print(drf.classes_)
        print(drf_sk.classes_)
        assert (drf.classes_ == drf_sk.classes_).all()

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all()

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_
Exemplo n.º 2
0
class RandomForestClassifier(object):
    """H2O RandomForestClassifier Solver

    Selects between h2o4gpu.solvers.xgboost.RandomForestClassifier
    and h2o4gpu.ensemble.forest.RandomForestClassifierSklearn
    Documentation:
    import h2o4gpu.solvers ; help(h2o4gpu.xgboost.RandomForestClassifierO)
    help(h2o4gpu.ensemble.forest.RandomForestClassifierSklearn)

    :param backend: Which backend to use.  Options are 'auto', 'sklearn',
        'h2o4gpu'.  Default is 'auto'.  The ``H2O4GPU_BACKEND`` environment
        variable, when set, overrides the argument.  With 'auto' the sklearn
        model is used as soon as any sklearn-only parameter differs from its
        default; otherwise the xgboost-based model is used.
        The requested value is stored in ``self.backend`` and the resulting
        choice in ``self.do_sklearn``.

    Parameters marked ``# h2o4gpu`` in the signature are forwarded to the
    xgboost-based model; the remaining ones only reach the sklearn model.
    """

    def __init__(
            self,
            n_estimators=10,  # h2o4gpu
            criterion='gini',
            max_depth=3,  # h2o4gpu
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features='auto',
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            bootstrap=True,
            oob_score=False,
            n_jobs=1,  # h2o4gpu
            random_state=None,  # h2o4gpu
            verbose=0,  # h2o4gpu
            warm_start=False,
            class_weight=None,
            # XGBoost specific params
            subsample=1.0,  # h2o4gpu
            colsample_bytree=1.0,  # h2o4gpu
            num_parallel_tree=100,  # h2o4gpu
            tree_method='gpu_hist',  # h2o4gpu
            n_gpus=-1,  # h2o4gpu
            predictor='gpu_predictor',  # h2o4gpu
            backend='auto'):  # h2o4gpu
        import os
        # The environment variable takes precedence over the argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend
        from ..typecheck.typechecks import assert_is_type
        assert_is_type(backend, str)

        # Fall back to Sklearn
        # Can remove if fully implement sklearn functionality
        self.do_sklearn = False
        if backend == 'auto':
            # (name, given value, default) for every parameter that only the
            # sklearn model understands.
            sklearn_only = (
                ('criterion', criterion, 'gini'),
                ('min_samples_split', min_samples_split, 2),
                ('min_samples_leaf', min_samples_leaf, 1),
                ('min_weight_fraction_leaf', min_weight_fraction_leaf, 0.0),
                ('max_features', max_features, 'auto'),
                ('max_leaf_nodes', max_leaf_nodes, None),
                ('min_impurity_decrease', min_impurity_decrease, 0.0),
                ('min_impurity_split', min_impurity_split, None),
                ('bootstrap', bootstrap, True),
                ('oob_score', oob_score, False),
                ('class_weight', class_weight, None),
            )
            for name, value, default in sklearn_only:
                if value != default:
                    self.do_sklearn = True
                    if verbose > 0:
                        print("WARNING: The sklearn parameter " +
                              name +
                              " has been changed from default to " +
                              str(value) +
                              ". Will run Sklearn RandomForestsClassifier.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        # Any other backend string leaves do_sklearn False (h2o4gpu model).
        self.backend = backend

        from h2o4gpu.ensemble import RandomForestClassifierSklearn
        self.model_sklearn = RandomForestClassifierSklearn(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)

        # Parameters for random forest
        # Be silent only when verbosity is off (the flag used to be inverted).
        silent = verbose == 0
        if random_state is None:
            random_state = 0

        import xgboost as xgb
        self.model_h2o4gpu = xgb.XGBClassifier(
            n_estimators=n_estimators,  # h2o4gpu
            max_depth=max_depth,  # h2o4gpu
            n_jobs=n_jobs,  # h2o4gpu
            random_state=random_state,  # h2o4gpu
            num_parallel_tree=num_parallel_tree,
            tree_method=tree_method,
            n_gpus=n_gpus,
            predictor=predictor,
            silent=silent,
            num_round=1,
            subsample=subsample,
            colsample_bytree=colsample_bytree)

        # Both models are always built; self.model points at the active one.
        if self.do_sklearn:
            print("Running sklearn RandomForestClassifier")
            self.model = self.model_sklearn
        else:
            print("Running h2o4gpu RandomForestClassifier")
            self.model = self.model_h2o4gpu

    def apply(self, X):
        """Return ``apply(X)`` of the sklearn model, whatever the backend.

        NOTE(review): fit() only fits the active model, so with the h2o4gpu
        backend the sklearn model may still be unfitted here -- confirm.
        """
        print("WARNING: apply() is using sklearn")
        return self.model_sklearn.apply(X)

    def decision_path(self, X):
        """Return ``decision_path(X)`` of the sklearn model, whatever the backend.

        NOTE(review): same caveat as apply() about the sklearn model possibly
        being unfitted when the h2o4gpu backend is active -- confirm.
        """
        print("WARNING: decision_path() is using sklearn")
        return self.model_sklearn.decision_path(X)

    def fit(self, X, y=None, sample_weight=None):
        """Fit the active model and refresh the mirrored attributes.

        :return: whatever the active model's ``fit`` returns.
        """
        res = self.model.fit(X, y, sample_weight)
        self.set_attributes()
        return res

    def get_params(self):
        """Return the parameters of the active model."""
        return self.model.get_params()

    def predict(self, X):
        """Predict labels for X with the active model.

        On the h2o4gpu path the result is thresholded in place at 0.5
        (values below become 0, values above become 1, exactly 0.5 is left
        unchanged) and squeezed before being returned.
        """
        res = self.model.predict(X)
        if not self.do_sklearn:
            res[res < 0.5] = 0
            res[res > 0.5] = 1
            res = res.squeeze()
        self.set_attributes()
        return res

    def predict_log_proba(self, X):
        """Return the element-wise natural log of ``predict_proba(X)``."""
        res = self.predict_proba(X)
        self.set_attributes()
        import numpy as np
        return np.log(res)

    def predict_proba(self, X):
        """Return class probabilities for X from the active model.

        The h2o4gpu path used to call ``predict()`` here and therefore
        returned labels; both backends now go through ``predict_proba()``.
        """
        res = self.model.predict_proba(X)
        self.set_attributes()
        return res

    def score(self, X, y, sample_weight=None):
        """Return the sklearn model's ``score`` on (X, y).

        NOTE(review): with the h2o4gpu backend the sklearn model is first
        re-fitted on the very data being scored, so the value describes the
        sklearn model rather than the active one.
        """
        # TODO add for h2o4gpu
        print("WARNING: score() is using sklearn")
        if not self.do_sklearn:
            self.model_sklearn.fit(X, y)  # Need to re-fit
        res = self.model_sklearn.score(X, y, sample_weight)
        return res

    def set_params(self, **params):
        """Forward ``params`` to the active model's ``set_params``."""
        return self.model.set_params(**params)

    def set_attributes(self):
        """ Set attributes for class

        Runs one assignment per attribute through the ``_setter`` helper.
        Presumably attributes missing on the active model are skipped, since
        NameError/AttributeError are handed to the helper -- confirm in
        solvers.utils.
        """
        from ..solvers.utils import _setter
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.estimators_ = oself.model.estimators_')
        s('oself.classes_ = oself.model.classes_')
        s('oself.n_classes_ = oself.model.n_classes_')
        s('oself.n_features_ = oself.model.n_features_')
        s('oself.n_outputs_ = oself.model.n_outputs_')
        s('oself.feature_importances_ = oself.model.feature_importances_')
        s('oself.oob_score_ = oself.model.oob_score_')
        s('oself.oob_decision_function_ = oself.model.oob_decision_function_')
Exemplo n.º 3
0
class RandomForestClassifier(object):
    """H2O RandomForestClassifier Solver

    Selects between h2o4gpu.solvers.xgboost.RandomForestClassifier
    and h2o4gpu.ensemble.forest.RandomForestClassifierSklearn
    Documentation:
    import h2o4gpu.solvers ; help(h2o4gpu.xgboost.RandomForestClassifierO)
    help(h2o4gpu.ensemble.forest.RandomForestClassifierSklearn)

    :param backend: Which backend to use.  Options are 'auto', 'sklearn',
        'h2o4gpu'.  Default is 'auto'.  The ``H2O4GPU_BACKEND`` environment
        variable, when set, overrides the argument.  With 'auto' the sklearn
        model is used as soon as any sklearn-only parameter differs from its
        default; otherwise the xgboost-based model is used.
        The requested value is stored in ``self.backend`` and the resulting
        choice in ``self.do_sklearn``.

    Parameters marked ``# h2o4gpu`` in the signature are forwarded to the
    xgboost-based model; the remaining ones only reach the sklearn model.
    """

    def __init__(
            self,
            n_estimators=10,  # h2o4gpu
            criterion='gini',
            max_depth=3,  # h2o4gpu
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features='auto',
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            bootstrap=True,
            oob_score=False,
            n_jobs=1,  # h2o4gpu
            random_state=None,  # h2o4gpu
            verbose=0,  # h2o4gpu
            warm_start=False,
            class_weight=None,
            # XGBoost specific params
            subsample=1.0,  # h2o4gpu
            colsample_bytree=1.0,  # h2o4gpu
            num_parallel_tree=100,  # h2o4gpu
            tree_method='gpu_hist',  # h2o4gpu
            n_gpus=-1,  # h2o4gpu
            predictor='gpu_predictor',  # h2o4gpu
            backend='auto'):  # h2o4gpu
        import os
        # The environment variable takes precedence over the argument.
        _backend = os.environ.get('H2O4GPU_BACKEND', None)
        if _backend is not None:
            backend = _backend
        from ..typecheck.typechecks import assert_is_type
        assert_is_type(backend, str)

        # Fall back to Sklearn
        # Can remove if fully implement sklearn functionality
        self.do_sklearn = False
        if backend == 'auto':
            # (name, given value, default) for every parameter that only the
            # sklearn model understands.
            sklearn_only = (
                ('criterion', criterion, 'gini'),
                ('min_samples_split', min_samples_split, 2),
                ('min_samples_leaf', min_samples_leaf, 1),
                ('min_weight_fraction_leaf', min_weight_fraction_leaf, 0.0),
                ('max_features', max_features, 'auto'),
                ('max_leaf_nodes', max_leaf_nodes, None),
                ('min_impurity_decrease', min_impurity_decrease, 0.0),
                ('min_impurity_split', min_impurity_split, None),
                ('bootstrap', bootstrap, True),
                ('oob_score', oob_score, False),
                ('class_weight', class_weight, None),
            )
            for name, value, default in sklearn_only:
                if value != default:
                    self.do_sklearn = True
                    if verbose > 0:
                        print("WARNING: The sklearn parameter " +
                              name +
                              " has been changed from default to " +
                              str(value) +
                              ". Will run Sklearn RandomForestsClassifier.")
        elif backend == 'sklearn':
            self.do_sklearn = True
        elif backend == 'h2o4gpu':
            self.do_sklearn = False
        # Any other backend string leaves do_sklearn False (h2o4gpu model).
        self.backend = backend

        from h2o4gpu.ensemble import RandomForestClassifierSklearn
        self.model_sklearn = RandomForestClassifierSklearn(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)

        # Parameters for random forest
        # Be silent only when verbosity is off (the flag used to be inverted).
        silent = verbose == 0
        if random_state is None:
            random_state = 0

        import xgboost as xgb
        self.model_h2o4gpu = xgb.XGBClassifier(
            n_estimators=n_estimators,  # h2o4gpu
            max_depth=max_depth,  # h2o4gpu
            n_jobs=n_jobs,  # h2o4gpu
            random_state=random_state,  # h2o4gpu
            num_parallel_tree=num_parallel_tree,
            tree_method=tree_method,
            n_gpus=n_gpus,
            predictor=predictor,
            silent=silent,
            num_round=1,
            subsample=subsample,
            colsample_bytree=colsample_bytree)

        # Both models are always built; self.model points at the active one.
        if self.do_sklearn:
            print("Running sklearn RandomForestClassifier")
            self.model = self.model_sklearn
        else:
            print("Running h2o4gpu RandomForestClassifier")
            self.model = self.model_h2o4gpu

    def apply(self, X):
        """Return ``apply(X)`` of the sklearn model, whatever the backend.

        NOTE(review): fit() only fits the active model, so with the h2o4gpu
        backend the sklearn model may still be unfitted here -- confirm.
        """
        print("WARNING: apply() is using sklearn")
        return self.model_sklearn.apply(X)

    def decision_path(self, X):
        """Return ``decision_path(X)`` of the sklearn model, whatever the backend.

        NOTE(review): same caveat as apply() about the sklearn model possibly
        being unfitted when the h2o4gpu backend is active -- confirm.
        """
        print("WARNING: decision_path() is using sklearn")
        return self.model_sklearn.decision_path(X)

    def fit(self, X, y=None, sample_weight=None):
        """Fit the active model and refresh the mirrored attributes.

        :return: whatever the active model's ``fit`` returns.
        """
        res = self.model.fit(X, y, sample_weight)
        self.set_attributes()
        return res

    def get_params(self):
        """Return the parameters of the active model."""
        return self.model.get_params()

    def predict(self, X):
        """Predict labels for X with the active model.

        On the h2o4gpu path the result is thresholded in place at 0.5
        (values below become 0, values above become 1, exactly 0.5 is left
        unchanged) and squeezed before being returned.
        """
        res = self.model.predict(X)
        if not self.do_sklearn:
            res[res < 0.5] = 0
            res[res > 0.5] = 1
            res = res.squeeze()
        self.set_attributes()
        return res

    def predict_log_proba(self, X):
        """Return the element-wise natural log of ``predict_proba(X)``."""
        res = self.predict_proba(X)
        self.set_attributes()
        import numpy as np
        return np.log(res)

    def predict_proba(self, X):
        """Return class probabilities for X from the active model.

        The h2o4gpu path used to call ``predict()`` here and therefore
        returned labels; both backends now go through ``predict_proba()``.
        """
        res = self.model.predict_proba(X)
        self.set_attributes()
        return res

    def score(self, X, y, sample_weight=None):
        """Return the sklearn model's ``score`` on (X, y).

        NOTE(review): with the h2o4gpu backend the sklearn model is first
        re-fitted on the very data being scored, so the value describes the
        sklearn model rather than the active one.
        """
        # TODO add for h2o4gpu
        print("WARNING: score() is using sklearn")
        if not self.do_sklearn:
            self.model_sklearn.fit(X, y)  # Need to re-fit
        res = self.model_sklearn.score(X, y, sample_weight)
        return res

    def set_params(self, **params):
        """Forward ``params`` to the active model's ``set_params``."""
        return self.model.set_params(**params)

    def set_attributes(self):
        """ Set attributes for class

        Runs one assignment per attribute through the ``_setter`` helper.
        Presumably attributes missing on the active model are skipped, since
        NameError/AttributeError are handed to the helper -- confirm in
        solvers.utils.
        """
        from ..solvers.utils import _setter
        s = _setter(oself=self, e1=NameError, e2=AttributeError)

        s('oself.estimators_ = oself.model.estimators_')
        s('oself.classes_ = oself.model.classes_')
        s('oself.n_classes_ = oself.model.n_classes_')
        s('oself.n_features_ = oself.model.n_features_')
        s('oself.n_outputs_ = oself.model.n_outputs_')
        s('oself.feature_importances_ = oself.model.feature_importances_')
        s('oself.oob_score_ = oself.model.oob_score_')
        s('oself.oob_decision_function_ = oself.model.oob_decision_function_')