def test_options_constructor(self):
    """Build a forest from a pre-filled ``forest_opts`` object and fit it.

    The options request 16 trees and as many data points per tree as the
    test data set reports. The test makes no assertions; it passes when
    construction and fitting complete without raising.
    """
    options = reg.forest_opts()
    options.num_trees = 16
    # Every tree is given the full number of points in the fixture data.
    n_points = self.data.num_data_points()
    options.num_data_points_per_tree = n_points

    forest = self.forest_constructor(options)
    forest.fit(self.data, self.rng)
def test_options_member(self):
    """Configure a forest through its ``options`` member and check the tree count.

    A default-constructed forest has its ``options`` attribute edited in
    place (7 trees, all data points per tree), is fitted on the fixture
    data, and must then report exactly 7 trees.
    """
    the_forest = self.forest_constructor()
    # The forest's own options object is edited directly; no separate
    # forest_opts instance is needed for this test.
    the_forest.options.num_trees = 7
    the_forest.options.num_data_points_per_tree = self.data.num_data_points()
    the_forest.fit(self.data, self.rng)
    self.assertEqual(the_forest.num_trees(), 7)
def test_options_constructor(self):
    """Check tree counts before and after fitting a forest built from options.

    A ``forest_opts`` object requesting 16 trees (each using all data
    points of the fixture) is handed to the forest constructor. The forest
    must report 0 trees before ``fit`` and 16 afterwards; finally a
    prediction on the first data point is issued as a smoke check.
    """
    expected_trees = 16

    options = reg.forest_opts()
    options.num_trees = expected_trees
    options.num_data_points_per_tree = self.data.num_data_points()

    forest = self.forest_constructor(options)
    # Constructing with options alone must not create any trees yet.
    self.assertEqual(forest.num_trees(), 0)

    forest.fit(self.data, self.rng)
    self.assertEqual(forest.num_trees(), expected_trees)

    # The return value is not inspected; the call only has to succeed.
    first_point = self.data.retrieve_data_point(0)
    forest.predict(first_point)
def __init__(
    self,
    configspace: ConfigurationSpace,
    types: typing.List[int],
    bounds: typing.List[typing.Tuple[float, float]],
    seed: int,
    log_y: bool = False,
    num_trees: int = N_TREES,
    do_bootstrapping: bool = True,
    n_points_per_tree: int = -1,
    ratio_features: float = 5. / 6.,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_num_nodes: int = 2**20,
    instance_features: typing.Optional[np.ndarray] = None,
    pca_components: typing.Optional[int] = None,
) -> None:
    """Store the random-forest settings and prepare the pyrfr option objects.

    Parameters
    ----------
    configspace, types, bounds, seed, instance_features, pca_components
        Forwarded unchanged to the base-class constructor. ``seed`` also
        seeds the pyrfr random engine and ``len(types)`` is used as the
        feature count when deriving ``max_features``.
    log_y : bool
        Stored as ``self.log_y``.
    num_trees : int
        Written to ``rf_opts.num_trees``.
    do_bootstrapping : bool
        Written to ``rf_opts.do_bootstrapping``.
    n_points_per_tree : int
        Stored as ``self.n_points_per_tree``; not written to ``rf_opts`` here.
    ratio_features : float
        Fraction of features used to derive ``tree_opts.max_features``.
        A value above 1.0 yields ``max_features = 0``; otherwise
        ``max(1, int(len(types) * ratio_features))`` is used.
    min_samples_split : int
        Written to ``tree_opts.min_samples_to_split``.
    min_samples_leaf : int
        Written to ``tree_opts.min_samples_in_leaf``.
    max_depth : int
        Written to ``tree_opts.max_depth``.
    eps_purity : float
        Written to ``tree_opts.epsilon_purity``.
    max_num_nodes : int
        Written to ``tree_opts.max_num_nodes``.
    """
    super().__init__(
        configspace=configspace,
        types=types,
        bounds=bounds,
        seed=seed,
        instance_features=instance_features,
        pca_components=pca_components,
    )

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    # Forest-level options.
    opts = regression.forest_opts()
    opts.num_trees = num_trees
    opts.do_bootstrapping = do_bootstrapping

    # NOTE(review): 0 is presumably pyrfr's "consider all features"
    # setting -- confirm against the pyrfr documentation.
    if ratio_features > 1.0:
        max_features = 0
    else:
        max_features = max(1, int(len(types) * ratio_features))

    # Per-tree options live on the nested tree_opts object.
    tree_opts = opts.tree_opts
    tree_opts.max_features = max_features
    tree_opts.min_samples_to_split = min_samples_split
    tree_opts.min_samples_in_leaf = min_samples_leaf
    tree_opts.max_depth = max_depth
    tree_opts.epsilon_purity = eps_purity
    tree_opts.max_num_nodes = max_num_nodes

    opts.compute_law_of_total_variance = False
    self.rf_opts = opts

    self.n_points_per_tree = n_points_per_tree
    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees,
        max_num_nodes,
        do_bootstrapping,
        n_points_per_tree,
        ratio_features,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        eps_purity,
        self.seed,
    ]
def __init__(self, types, bounds, num_trees=10, do_bootstrapping=True,
             n_points_per_tree=-1, ratio_features=5. / 6.,
             min_samples_split=3, min_samples_leaf=3, max_depth=20,
             eps_purity=1e-8, max_num_nodes=1000, seed=42, **kwargs):
    """Store the random-forest settings and prepare the pyrfr option object.

    Parameters
    ----------
    types
        Stored as ``self.types``; ``types.shape[0]`` is used as the feature
        count when deriving ``max_features``.
    bounds
        Stored as ``self.bounds``.
    num_trees, do_bootstrapping, min_samples_split, min_samples_leaf,
    max_depth, eps_purity, max_num_nodes
        Written to the corresponding fields of ``self.rf_opts``.
    n_points_per_tree
        Stored as ``self.n_points_per_tree``; not written to ``rf_opts`` here.
    ratio_features
        A value of 1.0 or more yields ``max_features = 0``; otherwise
        ``max(1, int(types.shape[0] * ratio_features))`` is used.
    seed
        Seeds the pyrfr random engine, is written to ``rf_opts.seed`` and
        stored as ``self.seed``.
    **kwargs
        Forwarded to the base-class constructor.
    """
    super().__init__(**kwargs)

    self.types = types
    self.bounds = bounds
    self.rng = regression.default_random_engine(seed)

    # In this variant all options are set directly on the forest_opts
    # object (no nested tree_opts).
    opts = regression.forest_opts()
    opts.num_trees = num_trees
    opts.seed = seed
    opts.do_bootstrapping = do_bootstrapping

    # NOTE(review): 0 is presumably pyrfr's "consider all features"
    # setting -- confirm against the pyrfr documentation.
    if ratio_features >= 1.0:
        max_features = 0
    else:
        max_features = max(1, int(types.shape[0] * ratio_features))

    opts.max_features = max_features
    opts.min_samples_to_split = min_samples_split
    opts.min_samples_in_leaf = min_samples_leaf
    opts.max_depth = max_depth
    opts.epsilon_purity = eps_purity
    opts.max_num_nodes = max_num_nodes
    self.rf_opts = opts

    self.n_points_per_tree = n_points_per_tree
    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees,
        max_num_nodes,
        do_bootstrapping,
        n_points_per_tree,
        ratio_features,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        eps_purity,
        seed,
    ]
    self.seed = seed

    logger_name = self.__module__ + "." + self.__class__.__name__
    self.logger = logging.getLogger(logger_name)
def _set_conf(
    self,
    c: Configuration,
    n_features: int,
    num_data_points: int,
) -> regression.forest_opts:
    """Transform a Configuration object into a forest_opts object.

    Parameters
    ----------
    c : Configuration
        Hyperparameter configuration. The keys ``num_trees``,
        ``do_bootstrapping``, ``max_features``, ``min_samples_to_split``
        and ``min_samples_in_leaf`` are read from it.
    n_features : int
        Number of features used to calculate the feature subset in the
        random forest.
    num_data_points : int
        Number of data points (required by the random forest).

    Returns
    -------
    pyrfr.regression.forest_opts
        A freshly created options object populated from ``c`` and the
        module-level constants ``MAX_DEPTH`` and ``MAX_NUM_NODES``.

    Raises
    ------
    ValueError
        If the module-level ``N_POINTS_PER_TREE`` is positive; only the
        "use all data points per tree" mode is handled here.
    """
    rf_opts = regression.forest_opts()
    rf_opts.num_trees = c["num_trees"]
    rf_opts.do_bootstrapping = c["do_bootstrapping"]

    tree_opts = rf_opts.tree_opts
    # ``max_features`` in the configuration is a fraction of the features;
    # convert it to an absolute count of at least one.
    tree_opts.max_features = max(1, int(np.rint(n_features * c["max_features"])))
    tree_opts.min_samples_to_split = int(c["min_samples_to_split"])
    tree_opts.min_samples_in_leaf = c["min_samples_in_leaf"]
    tree_opts.max_depth = MAX_DEPTH
    # Set exactly once from the module constant; the earlier hard-coded
    # value was always overwritten by this assignment.
    tree_opts.max_num_nodes = MAX_NUM_NODES

    if N_POINTS_PER_TREE <= 0:
        rf_opts.num_data_points_per_tree = num_data_points
    else:
        raise ValueError(
            "N_POINTS_PER_TREE must be <= 0 (use all data points per tree), "
            "got %r" % (N_POINTS_PER_TREE,)
        )

    return rf_opts
def __init__(self,
             types: np.ndarray,
             bounds: typing.List[typing.Tuple[float, float]],
             log_y: bool = False,
             num_trees: int = N_TREES,
             do_bootstrapping: bool = True,
             n_points_per_tree: int = -1,
             ratio_features: float = 5. / 6.,
             min_samples_split: int = 3,
             min_samples_leaf: int = 3,
             max_depth: int = 2**20,
             eps_purity: float = 1e-8,
             max_num_nodes: int = 2**20,
             seed: int = 42,
             **kwargs):
    """Store the random-forest settings and prepare the pyrfr option objects.

    Parameters
    ----------
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension,
        where the i-th entry corresponds to the i-th input dimension.
        Say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : list
        Specifies the bounds for continuous features.
    log_y : bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting.
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns bootstrapping in the random forest on / off.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used in
        _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values for them to be
        considered different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    seed : int
        The seed that is passed to the random_forest_run library.
    **kwargs
        Forwarded to the base-class constructor.
    """
    super().__init__(types, bounds, **kwargs)

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    # Forest-level options.
    opts = regression.forest_opts()
    opts.num_trees = num_trees
    opts.do_bootstrapping = do_bootstrapping

    # NOTE(review): 0 is presumably pyrfr's "consider all features"
    # setting -- confirm against the pyrfr documentation.
    if ratio_features > 1.0:
        max_features = 0
    else:
        max_features = max(1, int(types.shape[0] * ratio_features))

    # Per-tree options live on the nested tree_opts object.
    tree_opts = opts.tree_opts
    tree_opts.max_features = max_features
    tree_opts.min_samples_to_split = min_samples_split
    tree_opts.min_samples_in_leaf = min_samples_leaf
    tree_opts.max_depth = max_depth
    tree_opts.epsilon_purity = eps_purity
    tree_opts.max_num_nodes = max_num_nodes

    opts.compute_law_of_total_variance = False
    self.rf_opts = opts

    self.n_points_per_tree = n_points_per_tree
    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees,
        max_num_nodes,
        do_bootstrapping,
        n_points_per_tree,
        ratio_features,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        eps_purity,
        seed,
    ]
    self.seed = seed

    logger_name = self.__module__ + "." + self.__class__.__name__
    self.logger = logging.getLogger(logger_name)
def __init__(
    self,
    configspace: ConfigurationSpace,
    types: typing.List[int],
    bounds: typing.List[typing.Tuple[float, float]],
    seed: int,
    log_y: bool = False,
    num_trees: int = N_TREES,
    do_bootstrapping: bool = True,
    n_points_per_tree: int = -1,
    ratio_features: float = 5. / 6.,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_num_nodes: int = 2**20,
    instance_features: typing.Optional[np.ndarray] = None,
    pca_components: typing.Optional[int] = None,
) -> None:
    """Store the random-forest settings and prepare the pyrfr option objects.

    Parameters
    ----------
    configspace : ConfigurationSpace
        Forwarded unchanged to the base-class constructor.
    types : List[int]
        Specifies the number of categorical values of an input dimension,
        where the i-th entry corresponds to the i-th input dimension.
        Say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass [3, 0]. Note that we count starting from 0.
    bounds : List[Tuple[float, float]]
        Bounds of input dimensions: (lower, upper) for continuous dims;
        (n_cat, np.nan) for categorical dims.
    seed : int
        The seed that is passed to the random_forest_run library.
    log_y : bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting.
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns bootstrapping in the random forest on / off.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used in
        _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values for them to be
        considered different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    instance_features : np.ndarray (I, K)
        Contains the K-dimensional instance features of the I different
        instances.
    pca_components : typing.Optional[int]
        Number of components to keep when using PCA to reduce the
        dimensionality of instance features. Requires to set n_feats
        (> pca_dims).
    """
    super().__init__(
        configspace=configspace,
        types=types,
        bounds=bounds,
        seed=seed,
        instance_features=instance_features,
        pca_components=pca_components,
    )

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    # Forest-level options.
    opts = regression.forest_opts()
    opts.num_trees = num_trees
    opts.do_bootstrapping = do_bootstrapping

    # NOTE(review): 0 is presumably pyrfr's "consider all features"
    # setting -- confirm against the pyrfr documentation.
    if ratio_features > 1.0:
        max_features = 0
    else:
        max_features = max(1, int(len(types) * ratio_features))

    # Per-tree options live on the nested tree_opts object.
    tree_opts = opts.tree_opts
    tree_opts.max_features = max_features
    tree_opts.min_samples_to_split = min_samples_split
    tree_opts.min_samples_in_leaf = min_samples_leaf
    tree_opts.max_depth = max_depth
    tree_opts.epsilon_purity = eps_purity
    tree_opts.max_num_nodes = max_num_nodes

    opts.compute_law_of_total_variance = False
    self.rf_opts = opts

    self.n_points_per_tree = n_points_per_tree
    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees,
        max_num_nodes,
        do_bootstrapping,
        n_points_per_tree,
        ratio_features,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        eps_purity,
        self.seed,
    ]
def __init__(self,
             configspace,
             types: np.ndarray,
             bounds: np.ndarray,
             seed: int,
             num_trees: int = 10,
             do_bootstrapping: bool = True,
             n_points_per_tree: int = -1,
             ratio_features: float = 5. / 6.,
             min_samples_split: int = 3,
             min_samples_leaf: int = 3,
             max_depth: int = 20,
             eps_purity: float = 1e-8,
             max_num_nodes: int = 2**20,
             logged_y: bool = True,
             **kwargs):
    """Constructor

    Parameters
    ----------
    configspace : ConfigurationSpace
        Configspace to be passed to the random forest (used to impute
        inactive parameter-values).
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension,
        where the i-th entry corresponds to the i-th input dimension.
        Say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : np.ndarray (D, 2)
        Specifies the bounds for continuous features.
    seed : int
        The seed that is passed to the random_forest_run library.
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns bootstrapping in the random forest on / off.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used in
        _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values for them to be
        considered different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    logged_y : bool
        Indicates if the y data is transformed (i.e. put on logscale) or not.
    **kwargs
        Forwarded to the base-class constructor.
    """
    super().__init__(configspace=configspace, types=types, bounds=bounds,
                     seed=seed, **kwargs)

    self.configspace = configspace
    self.types = types
    self.bounds = bounds
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.do_bootstrapping = do_bootstrapping

    # NOTE(review): 0 is presumably pyrfr's "consider all features"
    # setting -- confirm against the pyrfr documentation.
    max_features = 0 if ratio_features > 1.0 else \
        max(1, int(types.shape[0] * ratio_features))
    self.rf_opts.tree_opts.max_features = max_features
    self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.tree_opts.max_depth = max_depth
    self.rf_opts.tree_opts.epsilon_purity = eps_purity
    self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
    # Always off. No need for this in our base EPM
    self.rf_opts.compute_law_of_total_variance = False

    self.n_points_per_tree = n_points_per_tree
    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest
    self.logged_y = logged_y

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees,
        max_num_nodes,
        do_bootstrapping,
        n_points_per_tree,
        ratio_features,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        eps_purity,
        seed,
    ]
    self.seed = seed

    # Starts empty; per the configspace description above it is meant to
    # hold imputation values for inactive parameters.
    self.impute_values = {}

    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
def __init__(
    self,
    types: np.ndarray,
    bounds: typing.List[typing.Tuple[float, float]],
    log_y: bool=False,
    bootstrap: bool=False,
    n_iters: int=50,
    n_splits: int=10,
    seed: int=42,
):
    """Set up the model with fixed forest defaults and search settings.

    Parameters
    ----------
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension,
        where the i-th entry corresponds to the i-th input dimension.
        Say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : List[Tuple[float, float]]
        Specifies the bounds for continuous features.
    log_y : bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting.
    bootstrap : bool
        Turns bootstrapping in the random forest on / off.
    n_iters : int
        Number of iterations for random search.
    n_splits : int
        Number of cross-validation splits.
    seed : int
        The seed that is passed to the random_forest_run library; it also
        seeds the NumPy ``RandomState`` stored as ``self.rs``.
    """
    super().__init__(
        types,
        bounds,
        log_y,
        num_trees=N_TREES,
        do_bootstrapping=bootstrap,
        n_points_per_tree=N_POINTS_PER_TREE,
        ratio_features=5 / 6,
        min_samples_split=3,
        min_samples_leaf=3,
        max_depth=MAX_DEPTH,
        eps_purity=EPSILON_IMPURITY,
        max_num_nodes=MAX_NUM_NODES,
        seed=seed,
    )

    self.types = types
    self.bounds = bounds
    self.log_y = log_y
    self.n_iters = n_iters
    self.n_splits = n_splits
    self.rng = regression.default_random_engine(seed)
    self.rs = np.random.RandomState(seed)
    self.bootstrap = bootstrap

    # Replace the options prepared by the base class with a fresh object
    # whose values differ from the ones passed to super().__init__ above
    # (all features, split at 2 samples, leaves of 1 sample, OOB error on).
    opts = regression.forest_opts()
    opts.num_trees = N_TREES
    opts.compute_oob_error = True
    opts.do_bootstrapping = bootstrap

    tree_opts = opts.tree_opts
    tree_opts.max_features = int(types.shape[0])
    tree_opts.min_samples_to_split = 2
    tree_opts.min_samples_in_leaf = 1
    tree_opts.max_depth = MAX_DEPTH
    tree_opts.epsilon_purity = EPSILON_IMPURITY
    tree_opts.max_num_nodes = MAX_NUM_NODES

    opts.compute_law_of_total_variance = False
    self.rf_opts = opts

    # No forest exists until the model is trained.
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    default_config = self._get_configuration_space().get_default_configuration()
    self._set_hypers(default_config)

    self.seed = seed

    logger_name = self.__module__ + "." + self.__class__.__name__
    self.logger = logging.getLogger(logger_name)
# Script fragment: builds a four-hyperparameter configuration space, creates
# pyrfr option/forest objects and a RandomForestWithInstances model, and
# defines a small fixed data set (8 rows, 4 columns).

# Configuration space: one float and three integer hyperparameters.
cs = ConfigurationSpace()
learning_rate = UniformFloatHyperparameter("learning_rate", 1e-4, 5e-3, default_value=3e-4)
cs.add_hyperparameter(learning_rate)
n_layer1 = UniformIntegerHyperparameter("n_layer1", 5, 50, default_value=32)
cs.add_hyperparameter(n_layer1)
n_layer2 = UniformIntegerHyperparameter("n_layer2", 30, 80, default_value=64)
cs.add_hyperparameter(n_layer2)
batch_size = UniformIntegerHyperparameter("batch_size", 10, 500, default_value=200)
cs.add_hyperparameter(batch_size)

# Derive the types/bounds description of the space for the model.
types, bounds = get_types(cs)

# Raw pyrfr forest and its options: 10 trees with bootstrapping.
# NOTE(review): `reg` is not used in the visible part of this script --
# presumably it is fitted further down; confirm.
reg = regression.binary_rss_forest()
rf_opts = regression.forest_opts()
rf_opts.num_trees = 10
rf_opts.do_bootstrapping = True

model = RandomForestWithInstances(types=types, bounds=bounds)

# Fixed inputs: 8 rows with one column per hyperparameter defined above.
# NOTE(review): values look like configurations scaled to [0, 1] -- confirm.
x = np.array([[0.78105907, 0.33860037, 0.72826097, 0.02941158],
              [0.81160897, 0.63147998, 0.72826097, 0.04901943],
              [0.27800406, 0.36616871, 0.16304333, 0.24509794],
              [0.41242362, 0.37351241, 0.11956505, 0.4607843],
              [0.70162934, 0.15819312, 0.51086957, 0.10784298],
              [0.53869654, 0.86662495, 0.27173903, 0.22549009],
              [0.53665988, 0.68576624, 0.81521753, 0.06862728],
              [0.72199594, 0.18900731, 0.75000011, 0.36274504]], dtype=np.float64)

# One target value per row of x.
y = np.array([0.544481, 2.34456, 0.654629, 0.576376, 0.603501, 0.506214, 0.416664, 0.483639])

print(x.dtype)

# Every tree is given as many data points as x has rows.
rf_opts.num_data_points_per_tree = x.shape[0]