# Imports reconstructed from the names used below; the module paths follow
# the auto-sklearn / ConfigSpace layout this component appears to come from.
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    Constant,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
    UnParametrizedHyperparameter,
)

from autosklearn.pipeline.components.base import (
    AutoSklearnRegressionAlgorithm,
    IterativeComponent,
)
from autosklearn.pipeline.constants import (
    DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS,
)
from autosklearn.util.common import check_for_bool, check_none


class ExtraTreesRegressor(
    IterativeComponent,
    AutoSklearnRegressionAlgorithm,
):
    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features, bootstrap, max_leaf_nodes,
                 max_depth, min_impurity_decrease, oob_score=False, n_jobs=1,
                 random_state=None, verbose=0):
        self.n_estimators = n_estimators
        self.estimator_increment = 10
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.n_estimators = int(self.n_estimators)
            if self.criterion not in ("mse", "friedman_mse", "mae"):
                raise ValueError(
                    "'criterion' is not in ('mse', 'friedman_mse', "
                    "'mae'): %s" % self.criterion)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            # Start with n_iter trees; warm_start lets later calls grow the
            # forest incrementally instead of refitting from scratch.
            self.estimator = ETR(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True,
            )
        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        # NOTE: sklearn's ExtraTreesRegressor exposes no predict_proba, so
        # this delegation would raise AttributeError if it were ever called;
        # it appears to be a leftover from the classifier counterpart.
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'handles_incremental_learning': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = Constant("n_estimators", 100)
        criterion = CategoricalHyperparameter(
            "criterion", ['mse', 'friedman_mse', 'mae'])
        max_features = UniformFloatHyperparameter(
            "max_features", 0.1, 1.0, default_value=1)
        max_depth = UnParametrizedHyperparameter(name="max_depth", value="None")
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None")
        min_samples_split = UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default_value=1)
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.0)
        bootstrap = CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default_value="False")

        cs.add_hyperparameters([
            n_estimators, criterion, max_features, max_depth, max_leaf_nodes,
            min_samples_split, min_samples_leaf, min_impurity_decrease,
            bootstrap,
        ])

        return cs
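# --- Usage sketch (illustrative, not part of the component) ------------------
# A minimal sketch of how the iterative-fit protocol above is typically
# driven: trees are added in warm-started increments until
# configuration_fully_fitted() reports that n_estimators has been reached.
# The synthetic dataset and the hyperparameter values are assumptions made
# only for this demonstration, and criterion="mse" presumes the older
# scikit-learn versions this component targets.
if __name__ == "__main__":
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    model = ExtraTreesRegressor(
        n_estimators=100, criterion="mse", min_samples_leaf=1,
        min_samples_split=2, max_features=1.0, bootstrap="False",
        max_leaf_nodes="None", max_depth="None", min_impurity_decrease=0.0,
        random_state=0)
    model.iterative_fit(X, y, n_iter=10)           # first 10 trees
    while not model.configuration_fully_fitted():
        model.iterative_fit(X, y, n_iter=10)       # 10 more per warm start
    print(model.predict(X[:5]))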
# Older ParamSklearn-era variant of the same component, kept for comparison.
# Imports below are reconstructed where unambiguous; the base class
# ParamSklearnRegressionAlgorithm and the DENSE/SPARSE/UNSIGNED_DATA/
# PREDICTIONS constants come from the historical ParamSklearn package, whose
# exact module paths are not reproduced here.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor as ETR


class ExtraTreesRegressor(ParamSklearnRegressionAlgorithm):
    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth", bootstrap=False,
                 max_leaf_nodes=None, max_depth="None", oob_score=False,
                 n_jobs=1, random_state=None, verbose=0):
        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        # ("mse",): the original was written as ("mse"), which is just the
        # string "mse" and turned this into a substring test.
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse'): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
            # if use_max_depth == "True":
            #     self.max_depth = int(max_depth)
            # elif use_max_depth == "False":
            #     self.max_depth = None
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)

        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False
        else:
            # The original left self.bootstrap unset for non-string values,
            # which raised AttributeError later for the bool default.
            self.bootstrap = bool(bootstrap)

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))

            self.estimator = ETR(
                n_estimators=0,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True,
            )

        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        # NOTE: a regressor has no predict_proba; this would raise
        # AttributeError if called and looks copied from the classifier.
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                # TODO find out if this is good because of sparsity...
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                # But rather fortran or C-contiguous?
                'preferred_dtype': np.float32}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        # In this older API, add_hyperparameter() returns the hyperparameter.
        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))
        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))
        min_samples_split = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        min_samples_leaf = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))
        # Unparametrized, we use min_samples as regularization
        # max_leaf_nodes_or_max_depth = UnParametrizedHyperparameter(
        #     name="max_leaf_nodes_or_max_depth", value="max_depth")
        # CategoricalHyperparameter("max_leaf_nodes_or_max_depth",
        #     choices=["max_leaf_nodes", "max_depth"], default="max_depth")
        # min_weight_fraction_leaf = UniformFloatHyperparameter(
        #     "min_weight_fraction_leaf", 0.0, 0.1)
        # max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
        #                                               value="None")
        bootstrap = cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))

        # Conditions
        # Not applicable because max_leaf_nodes is no legal value of the parent
        # cond_max_leaf_nodes_or_max_depth = \
        #     EqualsCondition(child=max_leaf_nodes,
        #                     parent=max_leaf_nodes_or_max_depth,
        #                     value="max_leaf_nodes")
        # cond2_max_leaf_nodes_or_max_depth = \
        #     EqualsCondition(child=use_max_depth,
        #                     parent=max_leaf_nodes_or_max_depth,
        #                     value="max_depth")
        # cond_max_depth = EqualsCondition(child=max_depth,
        #                                  parent=use_max_depth, value="True")
        # cs.add_condition(cond_max_leaf_nodes_or_max_depth)
        # cs.add_condition(cond2_max_leaf_nodes_or_max_depth)
        # cs.add_condition(cond_max_depth)
        return cs
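# --- Worked example (illustrative, not part of the component) -----------------
# In the older variant above, ``max_features`` is sampled from [0.5, 5] and
# rescaled inside iterative_fit to an absolute feature count on a log scale,
# capped at half of all features. A minimal sketch of that mapping, with an
# assumed feature count of 64:
if __name__ == "__main__":
    num_features = 64
    for mf in (0.5, 1.0, 5.0):
        absolute = int(mf * (np.log(num_features) + 1))
        absolute = max(1, min(num_features // 2, absolute))
        print(mf, "->", absolute)  # 0.5 -> 2, 1.0 -> 5, 5.0 -> 25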
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation,
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
#                                n_iter=1, verbose=2, random_state=42,
#                                n_jobs=-1)
# # Fit the random search model
# rf_random.fit(local_train, y_local_train)
# print(rf_random.best_params_)


# In[ ]:


# Extra-Trees model for train/validation performance (the original comment
# said "RF classifier", but this fits sklearn's ExtraTreesRegressor).
# NOTE: sklearn regressors have no predict_proba; for the binary 0/1 target
# used below, the regressor's continuous output serves as the positive-class
# score, and rounding it yields the binary prediction.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

clf = ExtraTreesRegressor(verbose=2, n_jobs=1, oob_score=True,
                          min_samples_leaf=2, bootstrap=True, criterion='mae',
                          max_depth=30, n_estimators=200, random_state=0)
clf.fit(local_train, y_local_train)

y_validation_pred_prob = clf.predict(local_validation)
y_validation_pred_binary = np.rint(y_validation_pred_prob).astype(int)

count_match = 0
count_error = 0
deviation = 0.0
assert len(y_validation_pred_prob) == len(y_local_validation)
validation_gtruth = np.asarray(y_local_validation)
for i in range(len(y_local_validation)):
    # total absolute deviation of the continuous predictions
    deviation += abs(y_validation_pred_prob[i] - validation_gtruth[i])
    # exact-match count of the rounded predictions
    if int(y_validation_pred_binary[i]) == int(validation_gtruth[i]):
        count_match += 1
    else:
        count_error += 1
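# In[ ]:


# The loop above accumulates a *total* absolute deviation; dividing by the
# number of samples gives the mean, which sklearn.metrics computes directly.
# A short equivalent sketch reusing the variables defined above (assuming the
# binary 0/1 ground truth the loop already relies on):
from sklearn.metrics import accuracy_score, mean_absolute_error

accuracy = accuracy_score(validation_gtruth.astype(int),
                          y_validation_pred_binary)
mae = mean_absolute_error(validation_gtruth, y_validation_pred_prob)
print("match rate: %.4f, mean abs deviation: %.4f" % (accuracy, mae))
# Sanity check: count_match / len(y_local_validation) equals `accuracy`, and
# deviation / len(y_local_validation) equals `mae`.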