    def test_options_constructor(self):
        fopts = reg.forest_opts()
        fopts.num_trees = 16
        fopts.num_data_points_per_tree = self.data.num_data_points()

        the_forest = self.forest_constructor(fopts)
        the_forest.fit(self.data, self.rng)
Example #2
    def test_options_member(self):

        the_forest = self.forest_constructor()
        fopts = reg.forest_opts()
        the_forest.options.num_trees = 7
        the_forest.options.num_data_points_per_tree = self.data.num_data_points()
        the_forest.fit(self.data, self.rng)

        self.assertEqual(the_forest.num_trees(), 7)
Example #3
    def test_options_constructor(self):
        fopts = reg.forest_opts()
        fopts.num_trees = 16
        fopts.num_data_points_per_tree = self.data.num_data_points()

        the_forest = self.forest_constructor(fopts)
        self.assertEqual(the_forest.num_trees(), 0)
        the_forest.fit(self.data, self.rng)

        self.assertEqual(the_forest.num_trees(), 16)
        the_forest.predict(self.data.retrieve_data_point(0))
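The tests above rely on a `self.data` container, a `self.rng` engine, and a `self.forest_constructor` callable prepared elsewhere. A minimal fixture sketch, assuming pyrfr's `default_data_container` / `default_random_engine` API and `binary_rss_forest` as the forest under test; the class name and toy values are made up for illustration:

import unittest

import numpy as np
import pyrfr.regression as reg


class ForestOptsTestBase(unittest.TestCase):
    def setUp(self):
        # seeded random engine and a small container with 2 input features
        self.rng = reg.default_random_engine(42)
        self.data = reg.default_data_container(2)
        for x1, x2 in np.random.RandomState(1).rand(20, 2):
            # arbitrary toy response: sum of the two features
            self.data.add_data_point([float(x1), float(x2)], float(x1 + x2))
        # which forest class the tests exercise is an assumption
        self.forest_constructor = reg.binary_rss_forest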
Example #4
    def __init__(
        self,
        configspace: ConfigurationSpace,
        types: typing.List[int],
        bounds: typing.List[typing.Tuple[float, float]],
        seed: int,
        log_y: bool = False,
        num_trees: int = N_TREES,
        do_bootstrapping: bool = True,
        n_points_per_tree: int = -1,
        ratio_features: float = 5. / 6.,
        min_samples_split: int = 3,
        min_samples_leaf: int = 3,
        max_depth: int = 2**20,
        eps_purity: float = 1e-8,
        max_num_nodes: int = 2**20,
        instance_features: typing.Optional[np.ndarray] = None,
        pca_components: typing.Optional[int] = None,
    ) -> None:
        super().__init__(
            configspace=configspace,
            types=types,
            bounds=bounds,
            seed=seed,
            instance_features=instance_features,
            pca_components=pca_components,
        )

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(len(types) * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, self.seed
        ]
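Example #4 only wires up `rf_opts`; the options are consumed later when the model is trained. A minimal sketch of such a `_train` method, following the pattern of SMAC's RandomForestWithInstances (not taken from the original file; assumes X is 2-D and y is 1-D, and uses the same pyrfr calls shown elsewhere on this page):

    def _train(self, X: np.ndarray, y: np.ndarray) -> regression.binary_rss_forest:
        # fill a pyrfr data container with the training points
        data = regression.default_data_container(X.shape[1])
        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)

        # use all points per tree unless a positive n_points_per_tree was given
        if self.n_points_per_tree <= 0:
            self.rf_opts.num_data_points_per_tree = X.shape[0]
        else:
            self.rf_opts.num_data_points_per_tree = self.n_points_per_tree

        self.rf = regression.binary_rss_forest()
        self.rf.options = self.rf_opts
        self.rf.fit(data, rng=self.rng)
        return self.rf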
Example #5
    def __init__(self,
                 types,
                 bounds,
                 num_trees=10,
                 do_bootstrapping=True,
                 n_points_per_tree=-1,
                 ratio_features=5. / 6.,
                 min_samples_split=3,
                 min_samples_leaf=3,
                 max_depth=20,
                 eps_purity=1e-8,
                 max_num_nodes=1000,
                 seed=42,
                 **kwargs):

        super().__init__(**kwargs)

        self.types = types
        self.bounds = bounds
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.seed = seed
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features >= 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.max_features = max_features
        self.rf_opts.min_samples_to_split = min_samples_split
        self.rf_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.max_depth = max_depth
        self.rf_opts.epsilon_purity = eps_purity
        self.rf_opts.max_num_nodes = max_num_nodes

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #6
    def _set_conf(
        self,
        c: Configuration,
        n_features: int,
        num_data_points: int,
    ) -> regression.forest_opts:
        """Transform a Configuration object a forest_opts object.

        Parameters
        ----------
        c : Configuration
            Hyperparameter configurations
        n_features : int
            Number of features used to calculate the feature subset in the random forest.
        num_data_points : int
            Number of data points (required by the random forest).

        Returns
        -------
        pyrfr.regression.forest_opts
        """

        rf_opts = regression.forest_opts()
        rf_opts.num_trees = c["num_trees"]
        rf_opts.do_bootstrapping = c["do_bootstrapping"]
        rf_opts.tree_opts.max_num_nodes = 2 ** 20

        rf_opts.tree_opts.max_features = max(1, int(np.rint(n_features * c["max_features"])))
        rf_opts.tree_opts.min_samples_to_split = int(
            c["min_samples_to_split"])
        rf_opts.tree_opts.min_samples_in_leaf = c["min_samples_in_leaf"]
        rf_opts.tree_opts.max_depth = MAX_DEPTH
        rf_opts.tree_opts.max_num_nodes = MAX_NUM_NODES

        if N_POINTS_PER_TREE <= 0:
            rf_opts.num_data_points_per_tree = num_data_points
        else:
            raise ValueError("N_POINTS_PER_TREE must be <= 0 (use all data points per tree)")

        return rf_opts
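A hedged sketch of the kind of Configuration this method expects: a space whose hyperparameter names match the keys read above. The ranges and defaults here are illustrative assumptions, not the ones used by the original code:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
)

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformIntegerHyperparameter("num_trees", 10, 100, default_value=10))
cs.add_hyperparameter(CategoricalHyperparameter("do_bootstrapping", [True, False], default_value=True))
cs.add_hyperparameter(UniformFloatHyperparameter("max_features", 0.1, 1.0, default_value=5. / 6.))
cs.add_hyperparameter(UniformIntegerHyperparameter("min_samples_to_split", 2, 20, default_value=3))
cs.add_hyperparameter(UniformIntegerHyperparameter("min_samples_in_leaf", 1, 20, default_value=3))

c = cs.get_default_configuration()
# c would then be passed as: self._set_conf(c, n_features=X.shape[1], num_data_points=X.shape[0])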
Example #7
    def __init__(self,
                 types: np.ndarray,
                 bounds: typing.List[typing.Tuple[float, float]],
                 log_y: bool = False,
                 num_trees: int = N_TREES,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 2**20,
                 eps_purity: float = 1e-8,
                 max_num_nodes: int = 2**20,
                 seed: int = 42,
                 **kwargs):
        """
        Parameters
        ----------
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([2, 0]). Note that we count starting from 0.
        bounds : list
            Specifies the bounds for continuous features.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        seed : int
            The seed that is passed to the random_forest_run library.
        """
        super().__init__(types, bounds, **kwargs)

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #8
    def __init__(
        self,
        configspace: ConfigurationSpace,
        types: typing.List[int],
        bounds: typing.List[typing.Tuple[float, float]],
        seed: int,
        log_y: bool = False,
        num_trees: int = N_TREES,
        do_bootstrapping: bool = True,
        n_points_per_tree: int = -1,
        ratio_features: float = 5. / 6.,
        min_samples_split: int = 3,
        min_samples_leaf: int = 3,
        max_depth: int = 2**20,
        eps_purity: float = 1e-8,
        max_num_nodes: int = 2**20,
        instance_features: typing.Optional[np.ndarray] = None,
        pca_components: typing.Optional[int] = None,
    ) -> None:
        """
        Parameters
        ----------
        types : List[int]
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass [3, 0]. Note that we count starting from 0.
        bounds : List[Tuple[float, float]]
            bounds of input dimensions: (lower, upper) for continuous dims; (n_cat, np.nan) for categorical dims
        seed : int
            The seed that is passed to the random_forest_run library.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        instance_features : np.ndarray (I, K)
            Contains the K dimensional instance features of the I different instances
        pca_components : float
            Number of components to keep when using PCA to reduce dimensionality of instance features.
            Requires n_feats (> pca_dims) to be set.
        """
        super().__init__(
            configspace=configspace,
            types=types,
            bounds=bounds,
            seed=seed,
            instance_features=instance_features,
            pca_components=pca_components,
        )

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(len(types) * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [num_trees, max_num_nodes, do_bootstrapping,
                       n_points_per_tree, ratio_features, min_samples_split,
                       min_samples_leaf, max_depth, eps_purity, self.seed]
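For the encoding described in the docstring above, a hand-built `types`/`bounds` pair might look like the following small illustration (in practice SMAC's `get_types(cs)` helper, used in the last example on this page, derives these from a ConfigurationSpace):

import numpy as np

# one categorical input with 3 choices, one continuous input in [0, 1]
types = [3, 0]
bounds = [(3, np.nan), (0.0, 1.0)]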
Example #9
    def __init__(self,
                 configspace,
                 types: np.ndarray,
                 bounds: np.ndarray,
                 seed: int,
                 num_trees: int = 10,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 20,
                 eps_purity: float = 1e-8,
                 max_num_nodes: int = 2**20,
                 logged_y: bool = True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        configspace: ConfigurationSpace
            configspace to be passed to random forest (used to impute inactive parameter-values)
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([2, 0]). Note that we count starting from 0.
        bounds : np.ndarray (D, 2)
            Specifies the bounds for continuous features.
        seed : int
            The seed that is passed to the random_forest_run library.
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        logged_y: bool
            Indicates if the y data is transformed (i.e. put on logscale) or not
        """
        super().__init__(configspace=configspace,
                         types=types,
                         bounds=bounds,
                         seed=seed,
                         **kwargs)

        self.configspace = configspace
        self.types = types
        self.bounds = bounds
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False  # Always off. No need for this in our base EPM

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest
        self.logged_y = logged_y

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.impute_values = {}

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #10
    def __init__(
        self,
        types: np.ndarray,
        bounds: typing.List[typing.Tuple[float, float]],
        log_y: bool = False,
        bootstrap: bool = False,
        n_iters: int = 50,
        n_splits: int = 10,
        seed: int = 42,
    ):
        """Parameters
        ----------
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([2, 0]). Note that we count starting from 0.
        bounds : np.ndarray (D, 2)
            Specifies the bounds for continuous features.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        bootstrap : bool
            Turns on / off bootstrapping in the random forest.
        n_iters : int
            Number of iterations for random search.
        n_splits : int
            Number of cross-validation splits.
        seed : int
            The seed that is passed to the random_forest_run library.
        """
        super().__init__(
            types,
            bounds,
            log_y,
            num_trees=N_TREES,
            do_bootstrapping=bootstrap,
            n_points_per_tree=N_POINTS_PER_TREE,
            ratio_features=5/6,
            min_samples_split=3,
            min_samples_leaf=3,
            max_depth=MAX_DEPTH,
            eps_purity=EPSILON_IMPURITY,
            max_num_nodes=MAX_NUM_NODES,
            seed=seed,
        )

        self.types = types
        self.bounds = bounds
        self.log_y = log_y
        self.n_iters = n_iters
        self.n_splits = n_splits
        self.rng = regression.default_random_engine(seed)
        self.rs = np.random.RandomState(seed)
        self.bootstrap = bootstrap

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = N_TREES
        self.rf_opts.compute_oob_error = True
        self.rf_opts.do_bootstrapping = self.bootstrap
        self.rf_opts.tree_opts.max_features = int(types.shape[0])
        self.rf_opts.tree_opts.min_samples_to_split = 2
        self.rf_opts.tree_opts.min_samples_in_leaf = 1
        self.rf_opts.tree_opts.max_depth = MAX_DEPTH
        self.rf_opts.tree_opts.epsilon_purity = EPSILON_IMPURITY
        self.rf_opts.tree_opts.max_num_nodes = MAX_NUM_NODES
        self.rf_opts.compute_law_of_total_variance = False

        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self._set_hypers(self._get_configuration_space().get_default_configuration())
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #11
    cs = ConfigurationSpace()
    learning_rate = UniformFloatHyperparameter("learning_rate", 1e-4, 5e-3, default_value=3e-4)
    cs.add_hyperparameter(learning_rate)

    n_layer1 = UniformIntegerHyperparameter("n_layer1", 5, 50, default_value=32)
    cs.add_hyperparameter(n_layer1)

    n_layer2 = UniformIntegerHyperparameter("n_layer2", 30, 80, default_value=64)
    cs.add_hyperparameter(n_layer2)

    batch_size = UniformIntegerHyperparameter("batch_size", 10, 500, default_value=200)
    cs.add_hyperparameter(batch_size)

    types, bounds = get_types(cs)
    reg = regression.binary_rss_forest()
    rf_opts = regression.forest_opts()
    rf_opts.num_trees = 10
    rf_opts.do_bootstrapping = True

    model = RandomForestWithInstances(types=types, bounds=bounds)
    x = np.array([[0.78105907, 0.33860037, 0.72826097, 0.02941158],
                  [0.81160897, 0.63147998, 0.72826097, 0.04901943],
                  [0.27800406, 0.36616871, 0.16304333, 0.24509794],
                  [0.41242362, 0.37351241, 0.11956505, 0.4607843],
                  [0.70162934, 0.15819312, 0.51086957, 0.10784298],
                  [0.53869654, 0.86662495, 0.27173903, 0.22549009],
                  [0.53665988, 0.68576624, 0.81521753, 0.06862728],
                  [0.72199594, 0.18900731, 0.75000011, 0.36274504]], dtype=np.float64)
    y = np.array([0.544481, 2.34456, 0.654629, 0.576376, 0.603501, 0.506214, 0.416664, 0.483639])
    print(x.dtype)
    rf_opts.num_data_points_per_tree = x.shape[0]
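    # A possible continuation (not part of the original snippet): fill a pyrfr
    # data container with x / y, hand rf_opts to the standalone forest and fit
    # it, mirroring the fit/predict pattern of the unit tests at the top of
    # this page.
    data = regression.default_data_container(x.shape[1])
    for row_x, row_y in zip(x, y):
        data.add_data_point(row_x, row_y)

    reg.options = rf_opts
    reg.fit(data, regression.default_random_engine(42))
    print(reg.predict(x[0]))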