def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a homogeneous random forest over the given input/output hypergrids.

    :param model_config: point in the forest's configuration parameter space.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions (exactly one target supported).
    :param logger: optional logger; a default one is created when omitted.
    """
    self.logger = create_logger("HomogeneousRandomForestRegressionModel") if logger is None else logger

    # Config must belong to the store's declared parameter space.
    assert model_config in homogeneous_random_forest_config_store.parameter_space

    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space,
        fit_state=HomogeneousRandomForestFitState(input_space=input_space, output_space=output_space)
    )

    # Flatten any hierarchical hypergrids so the individual trees operate on flat spaces.
    self._input_space_adapter = HierarchicalToFlatHypergridAdapter(adaptee=self.input_space)
    self._output_space_adapter = HierarchicalToFlatHypergridAdapter(adaptee=self.output_space)

    self.target_dimension_names = [dim.name for dim in self._output_space_adapter.dimensions]
    assert len(self.target_dimension_names) == 1, "Single target predictions for now."

    self._decision_trees = []
    self._create_estimators()
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a regression-enhanced random forest over one-hot-encoded, polynomial-expanded features.

    :param model_config: point in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("RegressionEnhancedRandomForestRegressionModel")
    self.logger = logger

    assert model_config in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )
    self.model_config = model_config

    # one hot encode categorical input dimensions
    self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(
        adaptee=input_space,
        merge_all_categorical_dimensions=True,
        drop='first'
    )
    # Explode continuous dimensions to polynomial features up to model config specified monomial degree
    # am using include_bias to produce constant term (all 1s) column to simplify one hot encoding logic
    self.polynomial_features_adapter = ContinuousToPolynomialBasisHypergridAdapter(
        adaptee=self.one_hot_encoder_adapter.target,
        degree=self.model_config.max_basis_function_degree,
        include_bias=True,
        interaction_only=False
    )

    self.input_space = input_space
    self.input_dimension_names = [dim.name for dim in self.input_space.dimensions]
    self.output_dimension_names = [dim.name for dim in self.output_space.dimensions]

    # Fit artifacts: all populated during .fit(); start unset.
    for fit_attr in (
            "base_regressor_",
            "random_forest_regressor_",
            "random_forest_kwargs",
            "root_model_kwargs",
            "detected_feature_indices_",
            "screening_root_model_coef_",
            "fit_X_",
            "partial_hat_matrix_",
            "base_regressor_standard_error_",
            "dof_",
            "variance_estimate_",
            "root_model_gradient_coef_",
            "polynomial_features_powers_",
            "categorical_zero_cols_idx_to_delete_",
    ):
        setattr(self, fit_attr, None)

    self.scaler_ = StandardScaler()
    self._trained = False
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a single decision-tree regressor for one target dimension.

    :param model_config: point in the decision tree config store's parameter space.
    :param input_space: hypergrid of feature dimensions (categoricals mapped to discrete).
    :param output_space: hypergrid with exactly one target dimension.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("DecisionTreeRegressionModel")
    self.logger = logger

    assert model_config in decision_tree_config_store.parameter_space
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )

    # sklearn trees need numeric features, so map categorical dimensions to discrete ones.
    self._input_space_adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.input_space)

    self.input_dimension_names = [dim.name for dim in self._input_space_adapter.dimensions]
    self.target_dimension_names = [dim.name for dim in self.output_space.dimensions]
    self.logger.debug(
        f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
    )

    assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per tree."

    # In the config, 0 is a sentinel for "unlimited" depth; 0 and 1 are invalid leaf-node counts.
    cfg = self.model_config
    effective_max_depth = cfg.max_depth if cfg.max_depth != 0 else None
    effective_max_leaf_nodes = cfg.max_leaf_nodes if cfg.max_leaf_nodes not in (0, 1) else None

    self._regressor = DecisionTreeRegressor(
        criterion=cfg.criterion,
        splitter=cfg.splitter,
        max_depth=effective_max_depth,
        min_samples_split=cfg.min_samples_split,
        min_samples_leaf=cfg.min_samples_leaf,
        min_weight_fraction_leaf=cfg.min_weight_fraction_leaf,
        max_features=cfg.max_features,
        random_state=cfg.get("random_state", None),
        max_leaf_nodes=effective_max_leaf_nodes,
        min_impurity_decrease=cfg.min_impurity_decrease,
        ccp_alpha=cfg.ccp_alpha
    )

    # These are used to compute the variance in predictions
    self._observations_per_leaf = dict()
    self._mean_per_leaf = dict()
    self._mean_variance_per_leaf = dict()
    self._sample_variance_per_leaf = dict()
    self._count_per_leaf = dict()
    self._trained = False
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a regression-enhanced random forest using a PolynomialFeatures basis transform.

    :param model_config: point in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("RegressionEnhancedRandomForestRegressionModel")
    self.logger = logger

    assert model_config in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )
    self.model_config = model_config

    # one hot encode categorical input dimensions
    self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(
        adaptee=input_space,
        merge_all_categorical_dimensions=True,
        drop='first'
    )

    self.input_space = input_space
    self.input_dimension_names = [dim.name for dim in self.input_space.dimensions]
    self.output_dimension_names = [dim.name for dim in self.output_space.dimensions]

    # set up basis feature transform
    self.polynomial_features_transform_ = PolynomialFeatures(degree=self.model_config.max_basis_function_degree)

    # Fit artifacts: all populated during .fit(); start unset.
    for fit_attr in (
            "base_regressor_",
            "random_forest_regressor_",
            "random_forest_kwargs",
            "root_model_kwargs",
            "detected_feature_indices_",
            "screening_root_model_coef_",
            "fit_X_",
            "partial_hat_matrix_",
            "base_regressor_standard_error_",
            "dof_",
            "variance_estimate_",
            "root_model_gradient_coef_",
            "polynomial_features_powers_",
            "categorical_zero_cols_idx_to_delete_",
    ):
        setattr(self, fit_attr, None)

    self.scaler_ = StandardScaler()
    self._trained = False
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a regression-enhanced random forest (dummy-variable encoding variant).

    :param model_config: point in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("RegressionEnhancedRandomForestRegressionModel")
    self.logger = logger

    assert model_config in RegressionEnhancedRandomForestRegressionModelConfig.CONFIG_SPACE
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )
    self.model_config = model_config

    self.input_dimension_names = [dim.name for dim in self.input_space.dimensions]
    self.output_dimension_names = [dim.name for dim in self.output_space.dimensions]

    # set up basis feature transform
    self.polynomial_features_transform_ = PolynomialFeatures(degree=self.model_config.max_basis_function_degree)

    # Fit artifacts (regressors, hat matrix, dummy-variable bookkeeping): populated during .fit().
    for fit_attr in (
            "base_regressor_",
            "random_forest_regressor_",
            "random_forest_kwargs",
            "root_model_kwargs",
            "detected_feature_indices_",
            "screening_root_model_coef_",
            "fit_X_",
            "partial_hat_matrix_",
            "base_regressor_standard_error_",
            "dof_",
            "variance_estimate_",
            "root_model_gradient_coef_",
            "polynomial_features_powers_",
            "num_dummy_vars_",
            "num_categorical_dims_",
            "continuous_dim_col_names_",
            "categorical_dim_col_names_",
            "dummy_var_map_",
            "dummy_var_cols_",
            "categorical_zero_cols_idx_to_delete_",
    ):
        setattr(self, fit_attr, None)
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger=None):
    """Construct a cross-validated Lasso model over one-hot-encoded inputs.

    :param model_config: point in the lasso_cross_validated config store's parameter space.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid with exactly one target dimension.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("LassoRegressionModel")
    self.logger = logger

    assert model_config in lasso_cross_validated_config_store.parameter_space
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )

    # setup adapters to accommodate categorical dimensions
    # one hot encode categorical input dimensions
    self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(
        adaptee=input_space,
        merge_all_categorical_dimensions=True,
        drop='first'
    )

    self.input_dimension_names = [dim.name for dim in self.one_hot_encoder_adapter.dimensions]
    self.continuous_dimension_names = [
        dim.name
        for dim in self.one_hot_encoder_adapter.target.dimensions
        if isinstance(dim, ContinuousDimension)
    ]
    self.target_dimension_names = [dim.name for dim in self.output_space.dimensions]
    self.logger.debug(
        f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
    )
    assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model."

    # Keyword arguments forwarded verbatim to sklearn's LassoCV.
    self.lasso_model_kwargs = {
        'eps': self.model_config.eps,
        'n_alphas': self.model_config.num_alphas,
        'alphas': None,
        'fit_intercept': self.model_config.fit_intercept,
        'normalize': self.model_config.normalize,
        'precompute': self.model_config.precompute,
        'max_iter': self.model_config.max_iter,
        'tol': self.model_config.tol,
        'copy_X': self.model_config.copy_x,
        'cv': self.model_config.num_cross_validations,
        'verbose': self.model_config.verbose,
        'n_jobs': self.model_config.num_jobs,
        'positive': self.model_config.positive,
        'random_state': None,
        'selection': self.model_config.selection
    }
    self._regressor = LassoCV(**self.lasso_model_kwargs)
    self._trained: bool = False

    # Confidence-interval machinery: populated/updated during fit.
    self.categorical_zero_cols_idx_to_delete_ = None
    self.dof_ = 0
    self.partial_hat_matrix_ = 0
    self.regressor_standard_error_ = 0
def __init__(self, model_config: Point, input_space: Hypergrid, output_space: Hypergrid, logger: logging.Logger = None):
    """Construct a cross-validated Lasso model over one-hot-encoded inputs.

    Keeps both the original input dimension names and their one-hot-projected
    counterparts, since callers address the model in the original space.

    :param model_config: point in the lasso_cross_validated config store's parameter space.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid with exactly one target dimension.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("LassoRegressionModel")
    self.logger = logger

    assert model_config in lasso_cross_validated_config_store.parameter_space
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )

    # setup adapters to accommodate categorical dimensions
    # one hot encode categorical input dimensions
    self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(
        adaptee=input_space,
        merge_all_categorical_dimensions=True,
        drop='first'
    )

    self.input_dimension_names = self.input_space.dimension_names
    self._projected_input_dimension_names = [dim.name for dim in self.one_hot_encoder_adapter.dimensions]
    self.continuous_dimension_names = [
        dim.name
        for dim in self.one_hot_encoder_adapter.target.dimensions
        if isinstance(dim, ContinuousDimension)
    ]
    self.target_dimension_names = [dim.name for dim in self.output_space.dimensions]
    self.logger.debug(
        f"Input dimensions: {str(self._projected_input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
    )
    assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model."

    # Keyword arguments forwarded verbatim to sklearn's LassoCV.
    self.lasso_model_kwargs = {
        'eps': self.model_config.eps,
        'n_alphas': self.model_config.num_alphas,
        'alphas': None,
        'fit_intercept': self.model_config.fit_intercept,
        'normalize': self.model_config.normalize,
        'precompute': self.model_config.precompute,
        'max_iter': self.model_config.max_iter,
        'tol': self.model_config.tol,
        'copy_X': self.model_config.copy_x,
        'cv': self.model_config.num_cross_validations,
        'verbose': self.model_config.verbose,
        'n_jobs': self.model_config.num_jobs,
        'positive': self.model_config.positive,
        'random_state': None,
        'selection': self.model_config.selection
    }
    self._regressor = LassoCV(**self.lasso_model_kwargs)
    self._trained: bool = False
    self.last_refit_iteration_number = None

    # Confidence-interval machinery: populated/updated during fit.
    self.categorical_zero_cols_idx_to_delete_ = None
    self.dof_ = 0
    self.partial_hat_matrix_ = 0
    self.regressor_standard_error_ = 0

    # When LassoCV is used as part of RERF, it cannot reasonably compute the upper and lower bounds on its input space dimensions,
    # as they are a polynomial combination of inputs to RERF. Thus, it approximates them with the empirical min and max.
    # These approximations are biased: the lower bound is too large, the upper bound is too small.
    # Consequently, during scoring, LassoCV is likely to see input outside of these bounds, but we still want
    # LassoCV to produce predictions for those points. So we introduce a little hack: whenever LassoCV is instantiated as part of RERF,
    # it should skip input filtering on predict. This field, controls this behavior.
    self.skip_input_filtering_on_predict = False
def __init__(
        self,
        model_config: Point,
        input_space: Hypergrid,
        output_space: Hypergrid,
        logger: logging.Logger = None
):
    """Construct a regression-enhanced random forest: polynomial basis expansion followed by one-hot encoding.

    Note: this constructor deliberately mutates ``model_config`` to enforce constraints
    required by the underlying sklearn estimators (see inline comments).

    :param model_config: point in the regression_enhanced_random_forest config store's parameter space.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("RegressionEnhancedRandomForestRegressionModel")
    self.logger = logger

    assert model_config in regression_enhanced_random_forest_config_store.parameter_space
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )
    self.model_config = model_config
    self.model_config.perform_initial_root_model_hyper_parameter_search = True

    # enforce model_config constraints (needed by sklearn regression model classes)
    # For .lasso_regression_model_config.fit_intercept, the intercept term in added in the design_matrix construction
    # For .lasso_regression_model_config.normalize, since the random forest would also need the scaled features,
    # scaling would have to be managed by ReRF directly
    model_config.lasso_regression_model_config.fit_intercept = False
    model_config.lasso_regression_model_config.normalize = False
    if model_config.sklearn_random_forest_regression_model_config.oob_score:
        model_config.sklearn_random_forest_regression_model_config.bootstrap = True

    # Explode continuous dimensions to polynomial features up to model config specified monomial degree
    # am using include_bias to produce constant term (all 1s) column to simplify one hot encoding logic
    self.polynomial_features_adapter = ContinuousToPolynomialBasisHypergridAdapter(
        adaptee=input_space,
        degree=self.model_config.max_basis_function_degree,
        include_bias=True,
        interaction_only=False
    )
    # one hot encode categorical input dimensions
    self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(
        adaptee=self.polynomial_features_adapter,
        merge_all_categorical_dimensions=True,
        drop='first'
    )

    self.input_dimension_names = [dim.name for dim in self.input_space.dimensions]
    self._projected_input_dimension_names = [dim.name for dim in self.one_hot_encoder_adapter.dimensions]
    self.continuous_dimension_names = [
        dim.name
        for dim in self.one_hot_encoder_adapter.target.dimensions
        if isinstance(dim, ContinuousDimension)
    ]
    self.output_dimension_names = [dim.name for dim in self.output_space.dimensions]

    self.base_regressor_ = None
    self.random_forest_regressor_ = None
    self.x_is_design_matrix = False

    # Fit artifacts: all populated during .fit(); start unset.
    for fit_attr in (
            "random_forest_kwargs",
            "root_model_kwargs",
            "detected_feature_indices_",
            "screening_root_model_coef_",
            "fit_X_",
            "partial_hat_matrix_",
            "base_regressor_standard_error_",
            "dof_",
            "variance_estimate_",
            "root_model_gradient_coef_",
            "polynomial_features_powers_",
            "categorical_zero_cols_idx_to_delete_",
    ):
        setattr(self, fit_attr, None)

    self._trained = False
    self.last_refit_iteration_number = None
def __init__(
        self,
        model_config: RegressionEnhancedRandomForestRegressionModelConfig,
        input_space: Hypergrid,
        output_space: Hypergrid,
        logger=None
):
    """Construct a regression-enhanced random forest: a linear root model boosted by a random forest.

    Instantiates the boosting root ("base") regressor named by the config (Lasso or Ridge),
    the residual random forest, and the optional polynomial basis transform.

    :param model_config: RegressionEnhancedRandomForestRegressionModelConfig instance.
    :param input_space: hypergrid of feature dimensions.
    :param output_space: hypergrid of target dimensions.
    :param logger: optional logger; a default one is created when omitted.
    """
    if logger is None:
        logger = create_logger("RegressionEnhancedRandomForestRegressionModel")
    self.logger = logger

    assert RegressionEnhancedRandomForestRegressionModelConfig.contains(model_config)
    RegressionModel.__init__(
        self,
        model_type=type(self),
        model_config=model_config,
        input_space=input_space,
        output_space=output_space
    )

    self.input_dimension_names = [dimension.name for dimension in self.input_space.dimensions]
    self.output_dimension_names = [dimension.name for dimension in self.output_space.dimensions]

    # Map (possibly hierarchical) dimension names to flattened names usable as dataframe columns.
    self._input_space_dimension_name_mappings = {
        dimension.name: Dimension.flatten_dimension_name(dimension.name)
        for dimension in self.input_space.dimensions
    }
    self._output_space_dimension_name_mappings = {
        dimension.name: Dimension.flatten_dimension_name(dimension.name)
        for dimension in self.output_space.dimensions
    }

    # Select and construct the boosting root ("base") regressor named by the config.
    self.base_regressor_ = None
    self.base_regressor_config = dict()
    self.base_regressor_config = self.model_config.boosting_root_model_config
    if self.model_config.boosting_root_model_name == SklearnLassoRegressionModelConfig.__name__:
        self.base_regressor_ = linear_model.Lasso(
            alpha=self.base_regressor_config.alpha,
            fit_intercept=self.base_regressor_config.fit_intercept,
            normalize=self.base_regressor_config.normalize,
            precompute=self.base_regressor_config.precompute,
            copy_X=self.base_regressor_config.copy_x,
            max_iter=self.base_regressor_config.max_iter,
            tol=self.base_regressor_config.tol,
            warm_start=self.base_regressor_config.warm_start,
            positive=self.base_regressor_config.positive,
            random_state=self.base_regressor_config.random_state,
            selection=self.base_regressor_config.selection
        )
    elif self.model_config.boosting_root_model_name == SklearnRidgeRegressionModelConfig.__name__:
        self.base_regressor_ = linear_model.Ridge(
            alpha=self.base_regressor_config.alpha,
            fit_intercept=self.base_regressor_config.fit_intercept,
            normalize=self.base_regressor_config.normalize,
            copy_X=self.base_regressor_config.copy_x,
            max_iter=self.base_regressor_config.max_iter,
            tol=self.base_regressor_config.tol,
            random_state=self.base_regressor_config.random_state,
            solver=self.base_regressor_config.solver
        )
    else:
        # BUG FIX: previously this *called* the logger object (self.logger(...)), which raises
        # TypeError since Logger instances are not callable. Log through .error() instead.
        self.logger.error('Boosting base model name "{0}" not supported currently.'
                          .format(self.model_config.boosting_root_model_name))

    # Residual model: fit on the root model's residuals during .fit().
    rf_config = self.model_config.random_forest_model_config
    self.random_forest_regressor_ = RandomForestRegressor(
        n_estimators=rf_config.n_estimators,
        criterion=rf_config.criterion,
        max_depth=rf_config.max_depth_value,
        min_samples_split=rf_config.min_samples_split,
        min_samples_leaf=rf_config.min_samples_leaf,
        min_weight_fraction_leaf=rf_config.min_weight_fraction_leaf,
        max_features=rf_config.max_features,
        max_leaf_nodes=rf_config.max_leaf_nodes_value,
        min_impurity_decrease=rf_config.min_impurity_decrease,
        bootstrap=rf_config.bootstrap,
        oob_score=rf_config.oob_score,
        n_jobs=rf_config.n_jobs,
        warm_start=rf_config.warm_start,
        ccp_alpha=rf_config.ccp_alpha,
        max_samples=rf_config.max_sample_value
    )

    # set up basis feature transform
    self.polynomial_features_transform_ = None
    if self.model_config.max_basis_function_degree > 1:
        self.polynomial_features_transform_ = \
            PolynomialFeatures(degree=self.model_config.max_basis_function_degree)

    # Fit artifacts: all populated during .fit(); start unset.
    self.random_forest_kwargs = None
    self.root_model_kwargs = None
    self.detected_feature_indices_ = None
    self.screening_root_model_coef_ = None
    self.fit_X_ = None
    self.partial_hat_matrix_ = None
    self.base_regressor_standard_error_ = None
    self.dof_ = None
    self.variance_estimate_ = None
    self.root_model_gradient_coef_ = None