Example #1
def test_real():
    """Tests for real scalars."""

    assert params.real(1) == 1.0
    assert params.real(-2.0) == -2.0

    for arg in (True, False):
        with pytest.raises(InvalidParameterError):
            params.real(arg)
Example #2
    def shaded_line(
        self,
        positions: np.ndarray,
        values: List[np.ndarray],
        color_idx: int = 0,
        label: Optional[str] = None,
        quantile_width: float = 0.5,
        alpha: float = 0.2,
        show_extrema: bool = True,
        **kwargs,
    ):
        """Draw a line plot with shaded quantiles.

        Parameters:
            positions: 1-d array of point locations on the horizontal axis
            values: list of arrays, each one containing all of the values at a given location.
                len(values) must equal len(positions)
            color_idx: color index
            label: line label
            quantile_width: central fraction of the distribution to shade. For the default
                value of 0.5, shading extends from the 25th to the 75th percentile.
            alpha: shading alpha level
            show_extrema: whether or not to draw dashed lines at the best/worst point
        """
        positions = params.real_vector(positions)
        values = params.tuple_(values, params.real_vector, arity=len(positions))
        color_idx = params.integer(color_idx, from_=0, below=len(self.configuration.color_set))
        quantile_width = params.real(quantile_width, from_=0, to=1)
        alpha = params.real(alpha, from_=0, to=1)

        color = self.configuration.color(color_idx)
        lower_bound = 0.5 - quantile_width / 2.0
        upper_bound = 0.5 + quantile_width / 2.0

        median = [np.median(samples) for samples in values]
        lower_shading = [np.quantile(samples, lower_bound) for samples in values]
        upper_shading = [np.quantile(samples, upper_bound) for samples in values]

        self.ax.plot(positions, median, linestyle="-", color=color, label=label, **kwargs)
        self.ax.fill_between(
            positions,
            lower_shading,
            upper_shading,
            color=color,
            alpha=alpha,
            **kwargs,
        )

        if show_extrema:
            min_val = [np.min(samples) for samples in values]
            max_val = [np.max(samples) for samples in values]
            self.ax.plot(positions, min_val, linestyle="--", color=color, **kwargs)
            self.ax.plot(positions, max_val, linestyle="--", color=color, **kwargs)
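
A standalone sketch of the same shaded-quantile idea using only numpy and matplotlib; the enclosing plot class and its color configuration are not shown in this fragment, so the synthetic data and axis handling below are illustrative assumptions rather than the class's actual code.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
positions = np.array([1.0, 2.0, 3.0, 4.0])
values = [rng.normal(loc=p, scale=1.0, size=50) for p in positions]

quantile_width = 0.5
lower_q, upper_q = 0.5 - quantile_width / 2, 0.5 + quantile_width / 2

median = [np.median(v) for v in values]
lower = [np.quantile(v, lower_q) for v in values]
upper = [np.quantile(v, upper_q) for v in values]

fig, ax = plt.subplots()
ax.plot(positions, median, linestyle="-", label="median")   # central line
ax.fill_between(positions, lower, upper, alpha=0.2)          # shaded quantile band
ax.legend()
plt.show()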
Example #3
    def __init__(
        self,
        rng: Optional[int] = None,
        num_seeds: int = 1,
        resolution: int = 64,
        max_relative_jump: float = 1.0,
        dimensions_varied: Union[str, float, int] = "all",
        max_iters: Optional[int] = None,
        max_evals: Optional[int] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            rng: pseudo-random number generator seed
            num_seeds: the number of starting points, and the number of points chosen at the end
                of each iteration
            resolution: the number of points to sample along a single dimension for a single seed
            max_relative_jump: the maximum relative step size along a single dimension. If a given
                dimension has length `L` and a seed has value `x` along that dimension, then the
                candidates are `resolution` linearly spaced points from the range
                [x - max_relative_jump * L, x + max_relative_jump * L] (clipped by the bounds).
                `max_relative_jump` must be in (0, 1].
                For a value of 1, the entire range is always considered.
            dimensions_varied: how many randomly selected dimensions to explore with each step.
                'all' indicates all dimensions. An integer directly specifies the number of
                dimensions. A float in (0, 1) specifies the fraction of all dimensions to vary.
            max_iters: the maximum number of iterations
            max_evals: the maximum number of function evaluations (this is a soft maximum:
                once it is reached then the current iteration finishes)

        TODO: add tolerance stopping conditions
        """
        super().__init__(rng=rng, **kwargs)

        self._num_seeds = params.integer(num_seeds, from_=1)
        self._resolution = params.integer(resolution, from_=2)
        self._max_relative_jump = params.real(max_relative_jump,
                                              above=0.0,
                                              to=1.0)
        self._dimensions_varied = params.any_(
            dimensions_varied,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, below=1.0),
            lambda arg: params.enumeration(arg, {"all"}),
        )
        self._max_iters = params.optional_(
            max_iters, lambda arg: params.integer(arg, from_=1))
        self._max_evals = params.optional_(
            max_evals, lambda arg: params.integer(arg, from_=1))
        if self._max_iters is None and self._max_evals is None:
            raise InvalidParameterError(
                "at least one stopping condition defined", "all Nones")
Example #4
    def __init__(self, mean=0.0, stddev=1.0, **kwargs):
        """Initialize state.

        Parameters:
            mean: mean of the normal distribution
            stddev: standard deviation of the normal distribution
            All parameters from base class 'Noise' initializer
        """

        super().__init__(**kwargs)

        self._mean = params.real(mean)
        self._stddev = params.real(stddev, above=0)
Example #5
    def __init__(self,
                 rng: Optional[int] = None,
                 maxiter: int = 1000,
                 local_search_options: Optional[dict] = None,
                 initial_temp: float = 5230.0,
                 restart_temp_ratio: float = 2e-05,
                 visit: float = 2.62,
                 accept: float = -5.0,
                 maxfun: int = int(1e7),
                 no_local_search: bool = False,
                 **kwargs):
        """Initialize state.

        Scipy-specific parameters are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time the optimizer is run.
            maxiter: The maximum number of iterations, where one iteration is one round of
                simulated annealing followed by one use of a local optimizer to find a local min.
            local_search_options: an optional kwargs dictionary to pass to the local minimizer,
                scipy.optimize.minimize: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html
                If no args are passed then the minimizer defaults to the L-BFGS-B method, since
                the problems being studied have bounds but no constraints.
            initial_temp: The initial temperature; higher values facilitate a wider search
                and make it easier to escape local minima.
            restart_temp_ratio: The temperature, relative to the initial temperature, at which
                the annealing process restarts.
            visit: a parameter of the visiting distribution. A higher value corresponds to a
                heavier tail and longer potential jumps.
            accept: a parameter of the acceptance distribution. A lower value means that uphill
                moves are less likely to be accepted.
            maxfun: soft limit for the total number of function evaluations; it may be exceeded
                only within a local search step if the limit is reached during that step.
            no_local_search: if True, the local search step is skipped and the method reduces
                to a generalized simulated annealing optimizer.
        """
        super().__init__(rng=rng, **kwargs)

        self._maxiter = params.integer(maxiter, from_=1)
        self._local_search_options = local_search_options or {}  # TODO: verify dictionaries
        self._initial_temp = params.real(initial_temp, above=0.01, to=5e4)
        self._restart_temp_ratio = params.real(restart_temp_ratio,
                                               above=0.0,
                                               below=1.0)
        self._visit = params.real(visit, above=0.0, to=3.0)
        self._accept = params.real(accept, above=-1e4, to=-5.0)
        self._maxfun = params.integer(maxfun, from_=1)
        self._no_local_search = params.boolean(no_local_search)
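
The validated settings above match the keyword arguments of scipy.optimize.dual_annealing by name; the actual dispatch to SciPy is not part of this fragment, so the call below is a hedged sketch on a toy objective rather than the class's own run method.

import numpy as np
from scipy.optimize import dual_annealing

result = dual_annealing(
    func=lambda x: np.sum(x ** 2),   # toy objective
    bounds=[(-5.0, 5.0)] * 3,
    maxiter=1000,
    initial_temp=5230.0,
    restart_temp_ratio=2e-05,
    visit=2.62,
    accept=-5.0,
    maxfun=int(1e7),
    no_local_search=False,
    seed=42,
)
print(result.x, result.fun)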
Example #6
    def __init__(
        self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
    ):
        """Initialize generalized function plot.

        Parameters:
            visualization_type: how to visualize generalized functions.
                Either single value or list of appropriate length.
                Possible values: "points" (default), "box-whisker", "shaded-line"
            rectify: whether and by how much each curve's values are horizontally displaced
                to visually disentangle markers from different curves at the same location.
                A non-negative number specifies the displacement; True requests automatic displacement;
                False (the default) leaves horizontal axis positions unmodified.
                If the horizontal axis scaling is logarithmic, the rectification factor
                is applied in log-space.

        Examples:
            # show three curves with automatic horizontal rectification
            __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
        """

        super().__init__(**kwargs)

        # parameter validation

        enum_f = lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"})
        self._visualization_type = params.any_(
            visualization_type, enum_f, lambda arg: params.tuple_(arg, enum_f)
        )
        # arity can only be tested in evaluate()

        self._rectify = params.any_(rectify, lambda arg: params.real(arg, from_=0), params.boolean)
Example #7
    def __init__(self,
                 internal_hp_optimization: bool = True,
                 kernel: Optional[Kernel] = None,
                 alpha: Union[float, Sequence] = 1e-5,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 normalize_y=False,
                 random_state: Optional[int] = None,
                 **kwargs):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            internal_hp_optimization: if True, hyperparameters are optimized "internally"
                by the Gaussian process, that is, scikit-learn optimizes hyperparameters
                and for smlb the learner has no hyperparameters;
                if False, hyperparameters are optimized by smlb (and scikit-learn does
                not optimize any hyperparameters)
            kernel: scikit-learn kernel; if None, a single Gaussian kernel is used as default
            alpha: regularization constant (scalar or vector); added as-is to kernel matrix diagonal.
                   Equivalent to adding a "WhiteKernel"; the default is the corresponding value from
                   scikit-learn's WhiteKernel, and differs from scikit-learn's GaussianProcessRegressor default.
            optimizer: hyperparameter optimization algorithm; used only if internal_hp_optimization is True
            n_restarts_optimizer: number of times optimizer is restarted; only used if internal_hp_optimization is True
            normalize_y: whether to subtract the mean of the labels
            random_state: integer seed

        See skl.gaussian_process.GaussianProcessRegressor parameters.
        """

        super().__init__(**kwargs)

        internal_hp_optimization = params.boolean(internal_hp_optimization)
        kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel),
                             params.none)
        # incomplete check for alpha as dimension becomes known only at fitting time
        alpha = params.any_(
            alpha,
            lambda arg: params.real(arg, from_=0),
            lambda arg: params.real_vector(arg, domain=[0, np.inf]),
        )
        # todo: check optimizer, requires params.union (of string and callable) and params.function
        normalize_y = params.boolean(normalize_y)
        random_state = params.integer(random_state)

        if kernel is None:
            kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

        assert internal_hp_optimization is True  # external HP optimization not yet supported

        self._model = skl.gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            random_state=random_state,
        )
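
For reference, a standalone sketch of the scikit-learn model configured above with its default kernel (RBF plus WhiteKernel); the synthetic data and call pattern are illustrative and not taken from the surrounding class.

import numpy as np
import sklearn.gaussian_process as gp

X = np.linspace(0.0, 1.0, 20).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel()

kernel = gp.kernels.RBF() + gp.kernels.WhiteKernel()
model = gp.GaussianProcessRegressor(kernel=kernel, alpha=1e-5, normalize_y=False)
model.fit(X, y)
mean, stddev = model.predict(X, return_std=True)   # predictive mean and standard deviation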
Example #8
    def __init__(
        self,
        optimizer_names: Optional[List[str]] = None,
        log_scale: bool = False,
        quantile_width: float = 0.5,
        show_extrema: bool = True,
        **kwargs,
    ):
        self._optimizer_names = params.optional_(
            optimizer_names, lambda arg: params.sequence(arg, type_=str)
        )
        self._show_extrema = params.boolean(show_extrema)
        log_scale = params.boolean(log_scale)
        scale = "log" if log_scale else "linear"

        self._quantile_width = params.real(quantile_width, from_=0, to=1)

        kwargs["axes_scales"] = kwargs.get("axes_scales", (scale, "linear"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("function evaluations", "best score", None, None)
        )
        kwargs["rectify"] = False
        kwargs["visualization_type"] = "shaded-line"

        super().__init__(**kwargs)
Example #9
    def __init__(self, target: float, goal: str = "maximize", **kwargs):
        super().__init__(**kwargs)

        self._target = params.real(target)
        goal = params.enumeration(goal, {"maximize", "minimize"})
        if goal == "maximize":
            self._direction = 1
        elif goal == "minimize":
            self._direction = -1
Example #10
    def __init__(self, value: float = 0, **kwargs):
        """Initialize state.

        Parameters:
            value: constant that will be returned
            All parameters from base class 'Noise' initializer
        """

        super().__init__(**kwargs)

        self._value = params.real(value)
Example #11
    def __init__(self, bias_correction: float = 0, **kwargs):
        """Initialize metric.

        Parameters:
            bias_correction: no correction by default. If a positive value d is given,
                division is by n-d. Bessel's correction (d=1) is unbiased for variance
                estimators, but not for standard deviation estimators. While there is
                no value that works across all distributions, d=1.5 is a reasonably
                good correction.
        """

        self._bias_correction = params.real(bias_correction, from_=0)

        super().__init__(**kwargs)
Example #12
    def __init__(
        self,
        fits: bool = True,
        fit_lambda: float = 1e-7,
        fit_weights: Optional[str] = None,
        base=10,
        **kwargs,
    ):
        """Initialize learning curve plot.

        Parameters:
            fits: if True, show estimated asymptotic fits
            fit_lambda: regularization strength for asymptotic fits; defaults to 1e-7
            fit_weights: if and how to weight fits; either
                None (no weighting) or "variance" (weight by the variance at each training set size)
            base: base for logarithmic plotting
            All parameters from base classes, in particular GeneralizedFunctionPlot and Plot.
        """

        # set learning curve-specific arguments if not explicitly set
        kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "log"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("training set size", "evaluation metric", None, None)
        )

        super().__init__(**kwargs)

        # parameters
        self._fits = params.boolean(fits)
        self._fit_lambda = params.real(fit_lambda, from_=0)
        self._fit_weights = params.any_(
            fit_weights, lambda arg: params.enumeration(arg, {"variance"}), params.none
        )
        self._base = params.real(base, from_=2)

        self._logf = lambda x: np.log(x) / np.log(self._base)
        self._powf = lambda x: np.power(self._base, x)
Example #13
    def evaluate(self, results, **kwargs):
        """Evaluate learning curve plot.

        Parameters:
            results: sequence of curve data, where each curve datum is a sequence of tuples (n,fx)
                of training set size n (positive integer) and performance values fx (sequence of real numbers).
        """

        # parameter validation

        tuple_testf = lambda arg: params.tuple_(
            arg, lambda arg: params.real(arg, above=0), params.real_vector, arity=2
        )
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        super().evaluate(results=results, **kwargs)

        ypowf = self._powf if self.axes_scales[1] == "log" else lambda arg: arg

        # asymptotic estimates
        if self._fits:
            asymptotic_fits = tuple(
                self.asymptotic_fit(fdata) for fdata in results)

            all_sizes = np.unique(
                [entry[0] for fdata in results for entry in fdata])
            sizes = np.linspace(start=np.min(all_sizes),
                                stop=np.max(all_sizes),
                                num=25)
            self._fit_data = np.empty(shape=(len(results), 2, len(sizes)))
            for i, (offset, slope, _, _) in enumerate(asymptotic_fits):
                yvalues = [
                    ypowf(offset + slope * self._logf(n)) for n in sizes
                ]
                self._fit_data[i, 0, :] = sizes
                self._fit_data[i, 1, :] = yvalues

            self.add_auxiliary(
                "asymptotic_fits",
                tuple({
                    "offset": offset,
                    "slope": slope,
                    "residuals": residuals,
                    "variance": variance,
                } for (offset, slope, residuals, variance) in asymptotic_fits),
            )
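
The loop above evaluates an asymptotic model that is linear in log space, y(n) ≈ offset + slope * log_b(n). The asymptotic_fit method itself (with fit_lambda regularization and optional variance weights) is not shown; the sketch below is a plain unregularized least-squares fit in log-log space under that assumption.

import numpy as np

base = 10
sizes = np.array([10, 20, 40, 80, 160])             # training set sizes n
scores = np.array([0.50, 0.35, 0.25, 0.18, 0.13])   # evaluation metric values

logf = lambda x: np.log(x) / np.log(base)
slope, offset = np.polyfit(logf(sizes), logf(scores), deg=1)   # fit log(y) = offset + slope * log(n)
predicted = np.power(base, offset + slope * logf(sizes))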
Example #14
    def _calculate_li_above(mean, stddev, target):
        """Calculate the likelihood of improvement, assuming the goal is to exceed the target.

        Parameters:
            mean: mean of the normal distribution
            stddev: standard deviation of the normal distribution
            target: value to exceed
        """
        stddev = params.real(stddev, from_=0.0)
        if stddev == 0:
            if mean > target:
                return 1.0
            else:
                return 0.0
        return 0.5 * (1 - erf((target - mean) / (stddev * math.sqrt(2))))
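
The closed form above is the survival function of a normal distribution, P(X > target) for X ~ N(mean, stddev^2). A quick numerical cross-check against scipy.stats.norm (values chosen arbitrarily):

import math
from scipy.stats import norm

mean, stddev, target = 1.0, 2.0, 2.5
closed_form = 0.5 * (1 - math.erf((target - mean) / (stddev * math.sqrt(2))))
assert abs(closed_form - norm.sf(target, loc=mean, scale=stddev)) < 1e-12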
Example #15
    def evaluate(self, results, **kwargs):
        """Evaluate optimization trajectory plot.

        Parameters:
            results: sequence of curve data, where each curve datum is a sequence of
                tuples (index, scores) of function evaluation number (positive integer)
                and best scores found after that many evaluations (sequence of real numbers).
        """
        tuple_testf = lambda arg: params.tuple_(
            arg, lambda arg: params.real(arg, above=0), params.real_vector, arity=2
        )
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        super().evaluate(results=results, **kwargs)
Example #16
    def __init__(self, D, r0, a, domain=(0, np.inf), **args):
        """Initialize state.

        Parameters:
            D: potential parameter determining well depth -D
            r0: potential parameter determining location r0 of minimum
            a: potential parameter, where 1/a is proportional to well width
            domain: domain of the dataset; defaults to [0, inf), on which the potential is defined
            All parameters from base class 'ComputedLabelsVectorSpaceData' initializer

        Raises:
            InvalidParameterError: on invalid parameter values
        """

        self._d = params.real(D, above=0)
        self._r0 = params.real(r0, above=0)
        self._a = params.real(a, above=0)

        def morsef(r):
            """Evaluate Morse potential at a sequence of vectors r.

            Parameters:
                r: n x 1 matrix of n one-dimensional vectors

            Returns:
                vector of Morse potential values at r
            """

            r = params.real_matrix(r, ncols=1)
            n = len(r)

            gamma = np.exp(-self._a * (r - self._r0))
            v = self._d * (np.square(gamma) - 2 * gamma)
            return v.reshape(n)

        super().__init__(dimensions=1, function=morsef, domain=domain, **args)
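
Sanity check of the Morse form: at r = r0 the exponential term equals 1, so V(r0) = D * (1 - 2) = -D, i.e. the well depth is D as documented. A direct numerical check with arbitrary parameter values:

import numpy as np

D, r0, a = 2.0, 1.5, 3.0
r = np.array([[r0]])                     # n x 1 matrix with n = 1
gamma = np.exp(-a * (r - r0))
v = D * (np.square(gamma) - 2 * gamma)
assert np.isclose(v.reshape(1)[0], -D)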
Example #17
    def __init__(
        self,
        uncertainties: Optional[str] = None,
        loss: str = "ls",
        alpha: float = 0.9,
        learning_rate: float = 0.1,
        subsample: float = 1.0,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: int = 3,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = None,
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        random_state: Optional[int] = None,
        ccp_alpha: float = 0.0,
        init: Optional[Any] = None,
        validation_fraction: float = 0.1,
        n_iter_no_change: Optional[int] = None,
        tol: float = 0.0001,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; possible choices are
                None; by default, GradientBoostingRegressor does not return any predictive uncertainties;
            loss: loss function to optimize; valid values are "ls" (least squares), "lad" (least absolute deviation),
                "huber" (Huber's loss), "quantile" (quantile regression). Use alpha parameter for huber and quantile.
            alpha: quantile for "huber" and "quantile" loss functions
            learning_rate: value by which to shrink contribution of consecutive trees; trade-off with num_estimators
            subsample: fraction of samples used for fitting base learners; values <1 result in
                Stochastic Gradient Boosting. Reducing subsample reduces variance and increases bias.
            n_estimators: number of decision trees
            criterion: either Friedman improved score ("friedman_mse"), variance reduction ("mse",
                mean squared error), or mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is 3
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node;
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            random_state: pseudo-random number generator seed
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            init: estimator for initial predictions; can be 'zero' for constant zero predictions
            validation_fraction: fraction of training data to set aside for early stopping; only with n_iter_no_change
            n_iter_no_change: set to integer to stop after no improvement (beyond tol) for that many rounds
            tol: tolerance for early stopping; only improvements larger than tol are considered

        The sklearn.GradientBoostingRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.GradientBoostingRegressor parameters.
        """

        super().__init__(**kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None})

        loss = params.enumeration(loss, {"ls", "lad", "huber", "quantile"})
        alpha = params.real(alpha, above=0, below=1)
        learning_rate = params.real(learning_rate, above=0, to=1)
        subsample = params.real(subsample, above=0, to=1)
        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"friedman_mse", "mse", "mae"})
        max_depth = params.any_(max_depth,
                                lambda arg: params.integer(arg, from_=1),
                                params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf,
                                               from_=0.0,
                                               to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(max_leaf_nodes,
                                     lambda arg: params.integer(arg, from_=1),
                                     params.none)
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        random_state = params.integer(random_state)
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        # no validation for init (no class signature validator)
        validation_fraction = params.real(validation_fraction,
                                          above=0,
                                          below=1)
        n_iter_no_change = params.any_(
            n_iter_no_change, lambda arg: params.integer(arg, from_=0),
            params.none)
        tol = params.real(tol, from_=0)

        self._model = skl.ensemble.GradientBoostingRegressor(
            loss=loss,
            alpha=alpha,
            learning_rate=learning_rate,
            subsample=subsample,
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            ccp_alpha=ccp_alpha,
            init=init,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )
Example #18
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training instances required at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
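
The num_trees check above combines two validators: any positive integer, or exactly -1 as the "use the number of training samples" sentinel. An illustration of that params.any_ pattern (values are arbitrary; any other input, such as 0, is expected to raise InvalidParameterError):

params.any_(
    200,
    lambda i: params.integer(i, above=0),
    lambda i: params.integer(i, from_=-1, to=-1),
)   # returns 200
params.any_(
    -1,
    lambda i: params.integer(i, above=0),
    lambda i: params.integer(i, from_=-1, to=-1),
)   # returns -1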
Example #19
    def __init__(self,
                 rng: Optional[int] = None,
                 strategy: str = "best1bin",
                 maxiter: int = 1000,
                 popsize: int = 15,
                 tol: float = 0.01,
                 mutation=(0.5, 1),
                 recombination: float = 0.7,
                 **kwargs):
        """Initialize state.

        Scipy-specific parameters are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time the optimizer is run.
            strategy: The differential evolution strategy to use. See documentation for complete
                list and explanations.
            maxiter: The maximum number of generations over which the entire population is evolved.
            popsize: A multiplier for setting the total population size.
            tol: Relative tolerance for convergence.
            mutation: The mutation constant. Either a number between 0 and 2 or a tuple (min, max)
                in which case the mutation constant is randomly selected uniformly from between
                min and max with each generation.
            recombination: The recombination constant. Must be between 0 and 1.

        """
        super().__init__(rng=rng, **kwargs)

        allowed_strategies = {
            "best1bin",
            "best1exp",
            "rand1exp",
            "randtobest1exp",
            "currenttobest1exp",
            "best2exp",
            "rand2exp",
            "randtobest1bin",
            "currenttobest1bin",
            "best2bin",
            "rand2bin",
            "rand1bin",
        }
        self._strategy = params.enumeration(strategy, allowed_strategies)

        self._maxiter = params.integer(maxiter, from_=1)
        self._popsize = params.integer(popsize, from_=1)
        self._tol = params.real(tol, above=0.0)

        def test_mutation_range(arg, low=0.0):
            return params.real(arg, from_=low, to=2.0)

        self._mutation = params.any_(
            mutation,
            test_mutation_range,
            lambda pair: params.tuple_(
                pair,
                test_mutation_range,
                lambda arg2: test_mutation_range(arg2, low=pair[0]),
                arity=2,
            ),
        )
        self._recombination = params.real(recombination, from_=0.0, to=1.0)
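
The validated settings correspond by name to keyword arguments of scipy.optimize.differential_evolution; the actual call site is not part of this fragment, so the mapping below is an assumption sketched on a toy objective.

import numpy as np
from scipy.optimize import differential_evolution

result = differential_evolution(
    func=lambda x: np.sum(x ** 2),   # toy objective
    bounds=[(-5.0, 5.0)] * 3,
    strategy="best1bin",
    maxiter=1000,
    popsize=15,
    tol=0.01,
    mutation=(0.5, 1),
    recombination=0.7,
    seed=42,
)
print(result.x, result.fun)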
Example #20
def test_mutation_range(arg, low=0.0):
    return params.real(arg, from_=low, to=2.0)
Example #21
    def __init__(
        self,
        rng: Optional[int] = None,
        uncertainties: Optional[str] = None,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: Optional[int] = None,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = "auto",
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        bootstrap: bool = True,
        n_jobs: Optional[int] = None,
        ccp_alpha: float = 0.0,
        max_samples: Optional[Union[int, float]] = None,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; choices are
                None; by default, ExtraTreesRegressor does not return predictive uncertainties;
                "naive"; uses the ensemble's standard deviation
            n_estimators: number of decision trees
            criterion: either variance reduction ("mse", mean squared error) or mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node;
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            bootstrap: if False, the whole dataset is used to build trees
            n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            max_samples: number of input samples to draw during bootstrap; integers directly specify the number,
                floating point values specify which fraction of samples to use; all by default

        The sklearn.ExtraTreesRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.ExtraTreesRegressor parameters.
        """

        super().__init__(rng=rng, **kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"mse", "mae"})
        max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(
            max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
        )
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        bootstrap = params.boolean(bootstrap)
        n_jobs = params.any_(
            n_jobs,
            lambda arg: params.integer(arg, from_=-1, to=-1),
            lambda arg: params.integer(arg, from_=1),
            params.none,
        )
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        max_samples = params.any_(
            max_samples,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, from_=0.0, to=1.0),
            params.none,
        )

        self._model = ExtraTreesRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
        )
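
The "naive" uncertainty option is documented above as the ensemble's standard deviation. The prediction code is not included in this fragment; one plausible way to obtain such an estimate from a fitted scikit-learn forest is the spread of the individual trees' predictions:

import numpy as np

def naive_uncertainties(model, X):
    """Mean and standard deviation of per-tree predictions of a fitted forest."""
    per_tree = np.array([tree.predict(X) for tree in model.estimators_])
    return per_tree.mean(axis=0), per_tree.std(axis=0)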