Example #1
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    Constant,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
    UnParametrizedHyperparameter,
)

# Base class, constants and helpers from auto-sklearn (import paths
# assumed from its pipeline package).
from autosklearn.pipeline.components.base import (
    AutoSklearnRegressionAlgorithm,
    IterativeComponent,
)
from autosklearn.pipeline.constants import DENSE, PREDICTIONS, SPARSE, UNSIGNED_DATA
from autosklearn.util.common import check_for_bool, check_none


class ExtraTreesRegressor(
        IterativeComponent,
        AutoSklearnRegressionAlgorithm,
):
    def __init__(self,
                 n_estimators,
                 criterion,
                 min_samples_leaf,
                 min_samples_split,
                 max_features,
                 bootstrap,
                 max_leaf_nodes,
                 max_depth,
                 min_impurity_decrease,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):

        self.n_estimators = n_estimators
        self.estimator_increment = 10
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.n_estimators = int(self.n_estimators)
            if self.criterion not in ("mse", "friedman_mse", "mae"):
                raise ValueError(
                    "'criterion' is not in ('mse', 'friedman_mse', "
                    "'mae'): %s" % self.criterion)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            self.estimator = ETR(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True)
        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y)

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        # Note: sklearn's ExtraTreesRegressor does not implement
        # predict_proba, so this delegation only works if the underlying
        # estimator exposes it.
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'ET',
            'name': 'Extra Trees Regressor',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'handles_incremental_learning': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = Constant("n_estimators", 100)
        criterion = CategoricalHyperparameter("criterion",
                                              ['mse', 'friedman_mse', 'mae'])
        max_features = UniformFloatHyperparameter("max_features",
                                                  0.1,
                                                  1.0,
                                                  default_value=1)

        max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                 value="None")
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None")

        min_samples_split = UniformIntegerHyperparameter("min_samples_split",
                                                         2,
                                                         20,
                                                         default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf",
                                                        1,
                                                        20,
                                                        default_value=1)
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.0)

        bootstrap = CategoricalHyperparameter("bootstrap", ["True", "False"],
                                              default_value="False")

        cs.add_hyperparameters([
            n_estimators, criterion, max_features, max_depth, max_leaf_nodes,
            min_samples_split, min_samples_leaf, min_impurity_decrease,
            bootstrap
        ])

        return cs
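
A minimal driver for this component might look like the following sketch. It assumes the auto-sklearn imports above resolve and an older scikit-learn in which the 'mse'/'mae' criteria still exist; the data is synthetic.

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=10, random_state=0)

# Sample a configuration from the component's own search space.
cs = ExtraTreesRegressor.get_hyperparameter_search_space()
config = cs.sample_configuration()

# Build the component and grow the forest in warm-started increments
# until the configured n_estimators is reached.
model = ExtraTreesRegressor(**config.get_dictionary(), random_state=0)
while not model.configuration_fully_fitted():
    model.iterative_fit(X, y, n_iter=model.estimator_increment)

print(model.predict(X[:5]))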
Example #2
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor as ETR

# ParamSklearnRegressionAlgorithm, the ConfigurationSpace classes and the
# DENSE/SPARSE/UNSIGNED_DATA/PREDICTIONS constants come from the historical
# ParamSklearn project (exact import paths omitted here).


class ExtraTreesRegressor(ParamSklearnRegressionAlgorithm):
    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):

        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse',): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
                #if use_max_depth == "True":
                #    self.max_depth = int(max_depth)
                #elif use_max_depth == "False":
                #    self.max_depth = None
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)

        self.max_features = float(max_features)

        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)

        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score, n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True
            )
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                # TODO find out if this is good because of sparsity...
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                # But rather fortran or C-contiguous?
                'preferred_dtype': np.float32}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))

        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))

        min_samples_split = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        min_samples_leaf = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))

        # Unparametrized, we use min_samples as regularization
        # max_leaf_nodes_or_max_depth = UnParametrizedHyperparameter(
        # name="max_leaf_nodes_or_max_depth", value="max_depth")
        # CategoricalHyperparameter("max_leaf_nodes_or_max_depth",
        # choices=["max_leaf_nodes", "max_depth"], default="max_depth")
        # min_weight_fraction_leaf = UniformFloatHyperparameter(
        #    "min_weight_fraction_leaf", 0.0, 0.1)
        # max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
        #                                              value="None")

        bootstrap = cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))

        # Conditions
        # Not applicable because max_leaf_nodes is no legal value of the parent
        #cond_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=max_leaf_nodes,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_leaf_nodes")
        #cond2_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=use_max_depth,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_depth")

        #cond_max_depth = EqualsCondition(child=max_depth, parent=use_max_depth,
        #value="True")
        #cs.add_condition(cond_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond2_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond_max_depth)

        return cs
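
Unlike Example #1, this variant rescales the max_features fraction by log(n_features) + 1 and then clamps the result to use at least one and at most half of the features. A small illustration of that arithmetic (the helper name is hypothetical):

import numpy as np

def effective_max_features(max_features, num_features):
    # Mirrors the logic in iterative_fit above.
    raw = int(float(max_features) * (np.log(num_features) + 1))
    return max(1, min(int(num_features / 2), raw))

print(effective_max_features(1.0, 100))  # log(100) + 1 ~= 5.6 -> 5
print(effective_max_features(5.0, 10))   # 16 is clamped to 10 // 2 = 5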
Example #3
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor as ETR

# Same ParamSklearn dependencies as in Example #2.


class ExtraTreesRegressor(ParamSklearnRegressionAlgorithm):
    def __init__(self,
                 n_estimators,
                 criterion,
                 min_samples_leaf,
                 min_samples_split,
                 max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False,
                 max_leaf_nodes=None,
                 max_depth="None",
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):

        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse',): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
                #if use_max_depth == "True":
                #    self.max_depth = int(max_depth)
                #elif use_max_depth == "False":
                #    self.max_depth = None
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)

        self.max_features = float(max_features)

        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)

        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(n_estimators=0,
                                 criterion=self.criterion,
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 bootstrap=self.bootstrap,
                                 max_features=max_features,
                                 max_leaf_nodes=self.max_leaf_nodes,
                                 oob_score=self.oob_score,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 random_state=self.random_state,
                                 warm_start=True)
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'ET',
            'name': 'Extra Trees Regressor',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': False,
            # TODO find out if this is good because of sparsity...
            'prefers_data_normalized': False,
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, ),
            # TODO find out what is best used here!
            # But rather fortran or C-contiguous?
            'preferred_dtype': np.float32
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(
            UniformFloatHyperparameter("max_features", 0.5, 5, default=1))

        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))

        min_samples_split = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_split", 2, 20,
                                         default=2))
        min_samples_leaf = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_leaf", 1, 20, default=1))

        # Unparametrized, we use min_samples as regularization
        # max_leaf_nodes_or_max_depth = UnParametrizedHyperparameter(
        # name="max_leaf_nodes_or_max_depth", value="max_depth")
        # CategoricalHyperparameter("max_leaf_nodes_or_max_depth",
        # choices=["max_leaf_nodes", "max_depth"], default="max_depth")
        # min_weight_fraction_leaf = UniformFloatHyperparameter(
        #    "min_weight_fraction_leaf", 0.0, 0.1)
        # max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
        #                                              value="None")

        bootstrap = cs.add_hyperparameter(
            CategoricalHyperparameter("bootstrap", ["True", "False"],
                                      default="False"))

        # Conditions
        # Not applicable because max_leaf_nodes is no legal value of the parent
        #cond_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=max_leaf_nodes,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_leaf_nodes")
        #cond2_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=use_max_depth,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_depth")

        #cond_max_depth = EqualsCondition(child=max_depth, parent=use_max_depth,
        #value="True")
        #cs.add_condition(cond_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond2_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond_max_depth)

        return cs
Example #4
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
#                                n_iter=100, cv=3, verbose=2, random_state=42,
#                                n_jobs=-1)
# # Fit the random search model
# rf_random.fit(local_train, y_local_train)
# print(rf_random.best_params_)
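
A runnable version of the commented-out search might look like the sketch below. random_grid was not defined in this excerpt, so the distributions here are hypothetical; n_iter=100 and cv=3 follow the "100 different combinations" and "3 fold cross validation" comments.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Hypothetical parameter distributions; the original random_grid is unknown.
random_grid = {
    'n_estimators': [100, 200, 400, 800],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# rf_random.fit(local_train, y_local_train)  # training data defined elsewhere
# print(rf_random.best_params_)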


# In[ ]:


# Extra-Trees model for train/validation performance (note: despite the
# regressor class, predict_proba is used below, which assumes
# classifier-style (negative, positive) probability pairs):
clf = ExtraTreesRegressor(verbose=2, n_jobs=1, oob_score=True,
                          min_samples_leaf=2, bootstrap=True,
                          criterion='mae', max_depth=30, n_estimators=200,
                          random_state=0)
clf.fit(local_train, y_local_train)
p = clf.predict_proba(local_validation)
y_validation_pred_binary = clf.predict(local_validation)

# Keep the positive-class probability from each pair.
y_validation_pred_prob = []
for x, y in p:
    y_validation_pred_prob.append(y)

# Compare predictions against the ground truth: count exact matches and
# accumulate the total absolute deviation of the probabilities.
count_match = 0
count_error = 0
deviation = 0.0
assert len(y_validation_pred_prob) == len(y_local_validation)
validation_gtruth = np.asarray(y_local_validation)
for i in range(len(y_local_validation)):
    deviation += abs(y_validation_pred_prob[i] - validation_gtruth[i])
    if int(y_validation_pred_binary[i]) == int(validation_gtruth[i]):
        count_match += 1
    else:
        count_error += 1
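
The bookkeeping above can be cross-checked with sklearn.metrics; this sketch reuses the arrays computed in this cell (note that deviation is a sum of absolute errors, while mean_absolute_error reports the mean):

from sklearn.metrics import accuracy_score, mean_absolute_error

accuracy = accuracy_score(validation_gtruth.astype(int),
                          np.asarray(y_validation_pred_binary).astype(int))
mae = mean_absolute_error(validation_gtruth, y_validation_pred_prob)
print("match rate: %.3f, mean absolute deviation: %.3f" % (accuracy, mae))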