Example #1
    def axes_scales(self, scales=(None, None), **kwargs):
        """Set axes scales.

        Parameters:
            scales: scales (None, "linear" or "log") for horizontal and vertical axes;
                None indicates to use the current value

        Examples:
            axes_scales(scales=(None, "log"))  # change only the vertical axis
        """

        scale_or_none_f = lambda arg: params.any_(
            arg, lambda arg: params.enumeration(arg, {"linear", "log"}), params.none
        )
        scales = params.tuple_(scales, scale_or_none_f, arity=2, default=None)

        # re-assign tuple as a whole
        self._scales = (
            self.axes_scales[0] if scales[0] is None else scales[0],
            self.axes_scales[1] if scales[1] is None else scales[1],
        )

        # set axes if specified (not None)
        # this allows passing kwargs specific to one axis
        if scales[0] is not None:
            self.ax.set_xscale(scales[0], **kwargs)
        if scales[1] is not None:
            self.ax.set_yscale(scales[1], **kwargs)
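
The params.any_, params.enumeration, and params.tuple_ calls used here (and throughout the examples below) share one convention: a validator either returns the validated value or raises. A minimal, hypothetical stand-in for two of these combinators makes the convention concrete; this is not the library's actual implementation:

class InvalidParameterError(ValueError):
    pass

def enumeration(value, allowed):
    # value must be a member of a fixed set
    if value in allowed:
        return value
    raise InvalidParameterError(f"{value!r} not in {allowed!r}")

def any_(value, *testfs):
    # the first validator that accepts the value wins
    for testf in testfs:
        try:
            return testf(value)
        except InvalidParameterError:
            continue
    raise InvalidParameterError(f"no validator accepted {value!r}")

Under these assumptions, any_("log", lambda a: enumeration(a, {"linear", "log"})) returns "log", mirroring scale_or_none_f above.
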
Example #2
    def __init__(
        self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
    ):
        """Initialize generalized function plot.

        Parameters:
            visualization_type: how to visualize generalized functions.
                Either single value or list of appropriate length.
                Possible values: "points" (default), "box-whisker", "shaded-line"
            rectify: whether and by how much each curve's values are horizontally displaced
                to visually disentangle markers from different curves at the same location.
                True indicates automatic displacement; False (the default) indicates no
                displacement, leaving horizontal axis positions unmodified.
                If the horizontal axis scaling is logarithmic, the rectification factor
                is applied in log-space.

        Examples:
            # show three curves with automatic horizontal rectification
            __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
        """

        super().__init__(**kwargs)

        # parameter validation

        enum_f = lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"})
        self._visualization_type = params.any_(
            visualization_type, enum_f, lambda arg: params.tuple_(arg, enum_f)
        )
        # arity can only be tested in evaluate()

        self._rectify = params.any_(rectify, lambda arg: params.real(arg, from_=0), params.boolean)
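
The rectify validator above accepts booleans as-is and real numbers from 0 upward. A runnable sketch of that rule (the library's own code is not shown here):

def validate_rectify(arg):
    # booleans pass through; reals pass if non-negative; all else is rejected
    if isinstance(arg, bool):
        return arg
    if isinstance(arg, (int, float)) and arg >= 0:
        return float(arg)
    raise ValueError(f"invalid rectify value: {arg!r}")

print(validate_rectify(True))   # True: automatic displacement
print(validate_rectify(0.05))   # 0.05: explicit displacement
# validate_rectify(-0.1) raises ValueError
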
Example #3
    def __init__(
        self,
        target=None,
        configuration: Optional[PlotConfiguration] = None,
        axes_labels=(None, None, None, None),
        axes_scales=("linear", "linear"),
        **kwargs,
    ):
        """Initialize Evaluation.

        Parameters:
            target: rendering target that evaluation outcome is rendered to;
                can be a single filename, or a matplotlib Axes or (Figure, Axes) pair,
                or a sequence thereof; if a matplotlib Axes or (Figure, Axes) pair,
                evaluation will add to it; if None, a new rendering target is created
            configuration: optional plot configuration controlling rendering details
            axes_labels: labels for all axes (bottom, left, top, right); None to not label an axis.
                For shorter tuples, remaining entries are assumed None, so ('x', 'y') is valid
            axes_scales: scales ("linear" or "log") for horizontal and vertical axes

        Examples:
            __init__(axes_labels=("bottom", "left", "top"))  # right is None
            __init__(axes_scales=("log", "log"))
        """

        configuration = params.any_(
            configuration, lambda arg: params.instance(arg, PlotConfiguration), params.none
        )

        super().__init__(configuration=configuration, **kwargs)

        # Axes, (Figure, Axes), filename, None, or sequence (without None)
        target_f = lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, mpl.axes.Axes),
            lambda arg: params.tuple_(
                arg,
                lambda arg: params.instance(arg, mpl.figure.Figure),
                lambda arg: params.instance(arg, mpl.axes.Axes),
                arity=2,
            ),
            params.string,
        )
        self._target = params.any_(
            target, target_f, params.none, lambda arg: params.tuple_(arg, target_f)
        )

        self._axes_labels = params.tuple_(
            axes_labels,
            lambda arg: params.any_(arg, params.string, params.none),
            arity=4,
            default=None,
        )

        self._axes_scales = params.tuple_(
            axes_scales, lambda arg: params.enumeration(arg, {"linear", "log"}), arity=2
        )

        self._figaxis = None
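
A sketch of the target forms the validator above accepts, using real matplotlib objects; the Evaluation construction itself is elided:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
valid_targets = [
    None,                 # create a new rendering target
    "plot.png",           # single filename
    ax,                   # existing Axes; evaluation adds to it
    (fig, ax),            # (Figure, Axes) pair
    ("a.png", "b.png"),   # sequence of targets
]
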
Example #4
    def __init__(self, target: float, goal: str = "maximize", **kwargs):
        super().__init__(**kwargs)

        self._target = params.real(target)
        goal = params.enumeration(goal, {"maximize", "minimize"})
        if goal == "maximize":
            self._direction = 1
        elif goal == "minimize":
            self._direction = -1
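
The class's _evaluate logic is not shown here, but a hypothetical sketch illustrates why a ±1 direction sign is convenient: multiplying by it lets a single comparison handle both goals.

def improvement(value, target, direction):
    # positive exactly when `value` beats `target` under the chosen goal
    return direction * (value - target)

print(improvement(2.0, 1.0, +1))  # 1.0: above target is better when maximizing
print(improvement(2.0, 1.0, -1))  # -1.0: above target is worse when minimizing
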
Example #5
    def __init__(
        self,
        rng: Optional[int] = None,
        num_seeds: int = 1,
        resolution: int = 64,
        max_relative_jump: float = 1.0,
        dimensions_varied: Union[str, float, int] = "all",
        max_iters: Optional[int] = None,
        max_evals: Optional[int] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            rng: pseudo-random number generator seed
            num_seeds: the number of starting points, and the number of points chosen at the end
                of each iteration
            resolution: the number of points to sample along a single dimension for a single seed
            max_relative_jump: the maximum relative step size along a single dimension. If a given
                dimension has length `L` and a seed has value `x` along that dimension, then the
                candidates are `resolution` linearly spaced points from the range
                [x - max_relative_jump * L, x + max_relative_jump * L] (clipped by the bounds).
                `max_relative_jump` must be in (0, 1].
                For a value of 1, the entire range is always considered.
            dimensions_varied: how many randomly selected dimensions to explore with each step.
                'all' indicates all dimensions. An integer directly specifies the number of
                dimensions. A float on (0, 1) indicates the fractional number of the total.
            max_iters: the maximum number of iterations
            max_evals: the maximum number of function evaluations (this is a soft maximum:
                once it is reached then the current iteration finishes)

        TODO: add tolerance stopping conditions
        """
        super().__init__(rng=rng, **kwargs)

        self._num_seeds = params.integer(num_seeds, from_=1)
        self._resolution = params.integer(resolution, from_=2)
        self._max_relative_jump = params.real(max_relative_jump,
                                              above=0.0,
                                              to=1.0)
        self._dimensions_varied = params.any_(
            dimensions_varied,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, below=1.0),
            lambda arg: params.enumeration(arg, {"all"}),
        )
        self._max_iters = params.optional_(
            max_iters, lambda arg: params.integer(arg, from_=1))
        self._max_evals = params.optional_(
            max_evals, lambda arg: params.integer(arg, from_=1))
        if self._max_iters is None and self._max_evals is None:
            raise InvalidParameterError(
                "at least one stopping condition defined", "all Nones")
Example #6
def element_data(element, property_):
    """Query chemical element data.

    Parameters:
        element: chemical element, given by either proton number (int) or abbreviation (str)
        property_: queried property; one of 'abbreviation', 'Z' (proton number)

    Returns:
        queried property

    Raises:
        InvalidParameterError: for invalid parameters
    """

    element = params.chemical_element(element)
    property_ = params.enumeration(property_, {"Z", "abbreviation"})
    return _element_data[element][property_]
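
Usage sketch of the function above; helium has proton number 2, so both queries follow directly from the docstring:

element_data("He", "Z")          # -> 2
element_data(2, "abbreviation")  # -> "He"
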
Example #7
    def __init__(self, orient=None, **kwargs):
        """Initialize state.

        Parameters:
            orient: actively orients metric towards minimization (-1) or maximization (+1)
                    if unspecified, the natural orientation of the metric is retained

        Raises:
            InvalidParameterError: if trying to orient a metric with no natural orientation
        """

        super().__init__(**kwargs)

        orient = params.enumeration(orient, {-1, +1, None})

        self._sign = +1  # default value leaves _evaluate() unchanged
        if orient is not None:
            if not self.has_orientation:
                raise InvalidParameterError("oriented metric",
                                            self.orientation)
            # -1 if desired and actual orientation disagree, otherwise +1
            self._sign = orient * self.orientation
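
A worked sketch of the sign rule in the last line: the sign is -1 exactly when the desired and natural orientations disagree, so _evaluate() output is negated only in that case.

for orient in (+1, -1):
    for orientation in (+1, -1):  # the metric's natural orientation
        sign = orient * orientation
        print(f"orient={orient:+d}, natural={orientation:+d} -> sign={sign:+d}")
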
Example #8
    def __init__(
        self,
        fits: bool = True,
        fit_lambda: float = 1e-7,
        fit_weights: Optional[str] = None,
        base=10,
        **kwargs,
    ):
        """Initialize learning curve plot.

        Parameters:
            fits: if True, show estimated asymptotic fits
            fit_lambda: regularization strength for asymptotic fits; defaults to 1e-7
            fit_weights: if and how to weight fits; one of
                None: no weighting, "variance": weigh by variance for each training set size
            base: base for logarithmic plotting
            All parameters from base classes, in particular GeneralizedFunctionPlot and Plot.
        """

        # set learning curve-specific arguments if not explicitly set
        kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "log"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("training set size", "evaluation metric", None, None)
        )

        super().__init__(**kwargs)

        # parameters
        self._fits = params.boolean(fits)
        self._fit_lambda = params.real(fit_lambda, from_=0)
        self._fit_weights = params.any_(
            fit_weights, lambda arg: params.enumeration(arg, {"variance"}), params.none
        )
        self._base = params.real(base, from_=2)

        self._logf = lambda x: np.log(x) / np.log(self._base)
        self._powf = lambda x: np.power(self._base, x)
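
A runnable check that _logf and _powf form an inverse pair for any base, which is what keeps fitting in log-space consistent with plotting in linear space:

import numpy as np

base = 10
logf = lambda x: np.log(x) / np.log(base)
powf = lambda x: np.power(base, x)

x = np.array([1.0, 32.0, 1000.0])
print(logf(x))        # [0.         1.50514998 3.        ]
print(powf(logf(x)))  # recovers [1., 32., 1000.] up to floating-point error
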
Example #9
    def evaluate(self, results, **kwargs):
        """Compute plot data for multiple generalized (set-valued) functions.

        Multiple curves C_1, ..., C_k can be drawn.
        Each curve C_i is specified by a non-empty sequence of 2-tuples,
        where the first value is location on horizontal axis, and the
        other value is a sequence of locations on the vertical axis.

        Each curve can be drawn in a different way (points, box-whisker).

        Parameters:
            results: sequence of generalized functions data (curve data).
                     Each datum is a sequence of tuples (x,fx), where
                     x is a real number and fx is a sequence of real numbers.

        Examples:
            # two curves sharing one horizontal location
            evaluate([
                [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
                [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))], # curve 2
            ])
        """

        super().evaluate(results=results, **kwargs)

        # parameter validation

        tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        # _rectify evaluates to True if True or if > 0
        if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
            raise InvalidParameterError(
                f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(self.RECTIFY_DELTAS)} curves"
            )

        # finalize parameter validation for visualization_type
        if not is_sequence(self._visualization_type):
            self._visualization_type = (self._visualization_type,) * len(results)
        self._visualization_type = params.tuple_(
            self._visualization_type,
            lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
            arity=len(results),
            default="points",
        )

        # prepare plot

        # determine all distinct horizontal positions in the results data
        all_positions = np.unique([entry[0] for curve in results for entry in curve])

        # there is nothing to do without data to plot
        if len(all_positions) == 0:
            self._plotdata = []
            return

        # do not rectify if there is only a single horizontal position
        if len(all_positions) == 1 or self._rectify is False:
            self._rectify = 0.0

        # automatic determination of horizontal rectification factor
        #
        # the correct way to draw box-plots on a logarithmic horizontal axis is to have
        # different left-width and right-width of the boxes. However, matplotlib does not
        # support this. Because box widths are small compared to horizontal plot range,
        # it suffices to use the sum of left- and right-half widths.
        between_groups_spacing = 0.4
        in_group_spacing = 0.9  # box-whisker plots
        if self.axes_scales[0] == "linear":
            logf = lambda arg: arg
            powf = lambda arg: arg
        elif self.axes_scales[0] == "log":
            base = 10
            logf = lambda arg: np.log(arg) / np.log(base)
            powf = lambda arg: np.power(base, arg)

        if self._rectify is True:
            # diff(...) requires at least two horizontal locations; this is ensured above
            self._rectify = (
                between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
            )

        # determine positions
        self._plotdata = [None] * len(results)
        deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
        for (i, curve) in enumerate(results):
            # point markers, every single point is drawn
            if self._visualization_type[i] == "points":
                positions = powf(
                    np.hstack(
                        [
                            logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                            for entry in curve
                        ]
                    )
                )
                values = np.hstack([entry[1] for entry in curve])
                self._plotdata[i] = np.transpose([positions, values])
            # box-whisker plots
            elif self._visualization_type[i] == "box-whisker":
                positions = np.asfarray(
                    [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve]
                )
                values = [entry[1] for entry in curve]
                # can't use rectify for width if 0; 1 is a wild guess
                # todo: if plot ranges have been set, a better default value could
                #       be 10% of horizontal plot range
                w = 1 if not self._rectify else self._rectify
                widths = powf((positions + w / 2) * in_group_spacing) - powf(
                    (positions - w / 2) * in_group_spacing
                )
                positions = powf(positions)
                self._plotdata[i] = (positions, values, widths)
            elif self._visualization_type[i] == "shaded-line":
                positions = np.asfarray([entry[0] for entry in curve])
                values = [entry[1] for entry in curve]
                self._plotdata[i] = (positions, values)
            else:
                raise BenchmarkError("internal error, unknown visualization type")
Example #10
    def __init__(
        self,
        select: Optional[Sequence[str]] = None,
        failmode="raise",
        samplef: Callable[[Any], Any] = lambda arg: arg,
        java_gateway: Optional[CdkJavaGateway] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            select: which features to compute (by default, all). List of names, order matters.
                Presets are available as class constants:
                PRESET_ALL: all features
                PRESET_ROBUST: a subset of descriptors that are fast to compute and do not fail
                    often (tested on QM9 and CEP datasets; see accompanying notebook)
            failmode: how to handle failed descriptor calculations, either due to rejected SMILES
                encodings or failing descriptor code. Possible values:
                "raise" [default]: raise a Benchmarexception
                "drop": drop the sample. Returned Data will have fewer samples
                ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries will
                    be set to False for failures
                ("index", index): where `index` is an empty list to which the indices of failed
                    entries will be appended
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            java_gateway: a gateway to a Java virtual machine

        Requires a CDK jar.
        """

        super().__init__(**kwargs)

        # parameters
        select = params.optional_(
            select,
            lambda arg: params.tuple_(
                arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
            ),
        )
        select = self.PRESET_ALL if select is None else select
        self._failmode = DataTransformationFailureMode.failmode(failmode)
        self._samplef = params.callable(samplef, num_pos_or_kw=1)
        self._java_gateway = params.optional_(
            java_gateway, lambda arg: params.instance(arg, JavaGateway)
        )
        if self._java_gateway is None:
            self._java_gateway = CdkJavaGateway()
        self._java_gateway = self._java_gateway.gateway

        # set up descriptors; eval() resolves each dotted JVM class path through the
        # py4j gateway and instantiates the descriptor
        self._descriptors = tuple(
            eval("self._java_gateway.jvm." + self.DESCRIPTORS[name][0] + "()") for name in select
        )

        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        for descriptor in self._descriptors:
            descriptor.initialise(builder)

        self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
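
A usage sketch of the failmode variants from the docstring; CdkDescriptors is a hypothetical name standing in for the featurizer class defined above:

import numpy as np

failures = []  # indices of failed samples are appended here
f1 = CdkDescriptors(failmode=("index", failures))

mask = np.empty(100, dtype=bool)  # one entry per sample; set to False on failure
f2 = CdkDescriptors(failmode=("mask", mask))

f3 = CdkDescriptors(failmode="drop")  # failed samples are dropped from returned Data
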
Example #11
    def __init__(
        self,
        rng: Optional[int] = None,
        uncertainties: Optional[str] = None,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: Optional[int] = None,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = "auto",
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        bootstrap: bool = True,
        n_jobs: Optional[int] = None,
        ccp_alpha: float = 0.0,
        max_samples: Optional[Union[int, float]] = None,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; choices are
                None: by default, ExtraTreesRegressor does not return predictive uncertainties;
                "naive": uses the ensemble's standard deviation
            n_estimators: number of decision trees
            criterion: either variance reduction ("mse", mean squared error), or, mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            bootstrap: if False, the whole dataset is used to build trees
            n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            max_samples: number of input samples to draw during bootstrap; integers directly specify the number,
                floating point values specify which fraction of samples to use; all by default

        The sklearn.ExtraTreesRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.ExtraTreesRegressor parameters.
        """

        super().__init__(rng=rng, **kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"mse", "mae"})
        max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(
            max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
        )
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        bootstrap = params.boolean(bootstrap)
        n_jobs = params.any_(
            n_jobs,
            lambda arg: params.integer(arg, from_=-1, to=-1),
            lambda arg: params.integer(arg, from_=1),
            params.none,
        )
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        max_samples = params.any_(
            max_samples,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, from_=0.0, to=1.0),
            params.none,
        )

        self._model = ExtraTreesRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
        )
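
A sketch of the "naive" uncertainty described above, computed from sklearn's public estimators_ attribute as the standard deviation of per-tree predictions over the ensemble:

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

X = np.random.rand(50, 3)
y = X.sum(axis=1)
model = ExtraTreesRegressor(n_estimators=100).fit(X, y)

per_tree = np.stack([tree.predict(X[:5]) for tree in model.estimators_])
print(per_tree.std(axis=0))  # one uncertainty estimate per query point
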
Example #12
    def __init__(self,
                 rng: Optional[int] = None,
                 strategy: str = "best1bin",
                 maxiter: int = 1000,
                 popsize: int = 15,
                 tol: float = 0.01,
                 mutation=(0.5, 1),
                 recombination: float = 0.7,
                 **kwargs):
        """Initialize state.

        Scipy-specific parameters are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time the optimizer is run.
            strategy: The differential evolution strategy to use. See documentation for complete
                list and explanations.
            maxiter: The maximum number of generations over which the entire population is evolved.
            popsize: A multiplier for setting the total population size.
            tol: Relative tolerance for convergence.
            mutation: The mutation constant. Either a number between 0 and 2 or a tuple (min, max)
                in which case the mutation constant is randomly selected uniformly from between
                min and max with each generation.
            recombination: The recombination constant. Must be between 0 and 1.

        """
        super().__init__(rng=rng, **kwargs)

        allowed_strategies = {
            "best1bin",
            "best1exp",
            "rand1exp",
            "randtobest1exp",
            "currenttobest1exp",
            "best2exp",
            "rand2exp",
            "randtobest1bin",
            "currenttobest1bin",
            "best2bin",
            "rand2bin",
            "rand1bin",
        }
        self._strategy = params.enumeration(strategy, allowed_strategies)

        self._maxiter = params.integer(maxiter, from_=1)
        self._popsize = params.integer(popsize, from_=1)
        self._tol = params.real(tol, above=0.0)

        def test_mutation_range(arg, low=0.0):
            return params.real(arg, from_=low, to=2.0)

        self._mutation = params.any_(
            mutation,
            test_mutation_range,
            lambda pair: params.tuple_(
                pair,
                test_mutation_range,
                lambda arg2: test_mutation_range(arg2, low=pair[0]),
                arity=2,
            ),
        )
        self._recombination = params.real(recombination, from_=0.0, to=1.0)
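
A sketch of how these parameters map onto scipy.optimize.differential_evolution, which this wrapper passes through; mutation=(0.5, 1) enables dithering, drawing a fresh mutation constant uniformly from [0.5, 1) each generation:

import numpy as np
from scipy.optimize import differential_evolution

result = differential_evolution(
    lambda x: np.sum(x ** 2),  # function to minimize
    bounds=[(-5, 5), (-5, 5)],
    strategy="best1bin",
    maxiter=1000,
    popsize=15,
    tol=0.01,
    mutation=(0.5, 1),
    recombination=0.7,
    seed=42,
)
print(result.x)  # close to [0, 0]
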
Example #13
    def __init__(
            self,
            select: Union[str, Sequence[str]] = "all",
            samplef: Callable[[Any], Any] = lambda arg: arg,
            stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
            elemental_preset: str = "magpie",
            ionic_fast: bool = False,
            valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
            valence_props: Sequence[str] = ("avg", "frac"),
            **kwargs,
    ):
        """Initialize state.

        Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
        IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
        with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
        is the p_list parameter of Stoichiometry. For further details on these, see
        https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

        Parameters:
            select: which feature sets to compute (by default, all). Specifying
                multiple sets (e.g., ('stoichiometry', 'elemental')) selects both.
                Valid choices:
                'all': all features
                'stoichiometry': norms of stoichiometric features
                'elemental': element properties
                'ionic': ion properties
                'valence': valence orbital shell features
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            stoichiometry_p_list: list of L_p norms to compute
            elemental_preset: matminer preset to use. Valid choices include:
                'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
            ionic_fast: if True, assumes that elements exist in single oxidation state
            valence_orbitals: which valence orbitals to consider
            valence_props: whether to return average properties, fractional, or both

        Requires the matminer package (see file documentation).
        """

        super().__init__(**kwargs)

        SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

        if select == "all":
            select = SELECT_SETS
        if isinstance(select, str):
            select = (select,)  # tuple(select) would yield a tuple of the characters in select
        select = params.tuple_(
            select,
            lambda arg: params.enumeration(arg, set(SELECT_SETS)),
        )

        self._stoichiometry_p_list = params.tuple_(
            stoichiometry_p_list, lambda p: params.integer(p, from_=0))
        self._elemental_preset = params.enumeration(
            elemental_preset,
            {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"})
        self._ionic_fast = params.boolean(ionic_fast)
        self._valence_orbitals = params.tuple_(
            valence_orbitals,
            lambda arg: params.enumeration(arg, {"s", "p", "d", "f"}))
        self._valence_props = params.tuple_(
            valence_props,
            lambda arg: params.enumeration(arg, {"avg", "frac"}))

        self.samplef = samplef  # todo: add callable to params

        # set up matminer
        try:
            import matminer
            import matminer.featurizers
            import matminer.featurizers.base
            import matminer.featurizers.composition
            import matminer.featurizers.conversions
            import pymatgen
        except ModuleNotFoundError as e:
            raise BenchmarkError(
                f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
            ) from e

        self._composition = pymatgen.core.composition.Composition

        # set up features
        features = []
        if "stoichiometry" in select:
            features.append(
                matminer.featurizers.composition.Stoichiometry(
                    p_list=self._stoichiometry_p_list))
        if "elemental" in select:
            features.append(
                matminer.featurizers.composition.ElementProperty.from_preset(
                    self._elemental_preset))
        if "ionic" in select:
            features.append(
                matminer.featurizers.composition.IonProperty(
                    fast=self._ionic_fast))
        if "valence" in select:
            features.append(
                matminer.featurizers.composition.ValenceOrbital(
                    orbitals=self._valence_orbitals,
                    props=self._valence_props))

        self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(
            features)
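
A usage sketch, assuming matminer and pymatgen are installed: MultipleFeaturizer exposes featurize(), which concatenates the feature vectors of its member featurizers:

from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import IonProperty, Stoichiometry
from pymatgen.core.composition import Composition

features = MultipleFeaturizer([Stoichiometry(p_list=(0, 2)), IonProperty()])
vector = features.featurize(Composition("Fe2O3"))
print(len(vector))
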
Example #14
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training samples required at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
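
A usage sketch of the num_trees guidance above, assuming lolopy's scikit-learn-style interface with predict(..., return_std=True); running it requires the lolo jar and a JVM:

import numpy as np
from lolopy.learners import RandomForestRegressor

X = np.random.rand(64, 3)
y = X.sum(axis=1)

model = RandomForestRegressor(num_trees=4 * len(X), use_jackknife=True)
model.fit(X, y)
mean, std = model.predict(X[:5], return_std=True)  # jackknife-based deviations
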
Example #15
    def __init__(
        self,
        uncertainties: Optional[str] = None,
        loss: str = "ls",
        alpha: float = 0.9,
        learning_rate: float = 0.1,
        subsample: float = 1.0,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: int = 3,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = None,
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        random_state: Optional[int] = None,
        ccp_alpha: float = 0.0,
        init: Optional[Any] = None,
        validation_fraction: float = 0.1,
        n_iter_no_change: Optional[int] = None,
        tol: float = 0.0001,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; the only choice is
                None: by default, GradientBoostingRegressor does not return any predictive uncertainties
            loss: loss function to optimize; valid values are "ls" (least squares), "lad" (least absolute deviation),
                "huber" (Huber's loss), "quantile" (quantile regression). Use alpha parameter for huber and quantile.
            alpha: quantile for "huber" and "quantile" loss functions
            learning_rate: value by which to shrink contribution of consecutive trees; trade-off with n_estimators
            subsample: fraction of samples for fitting base learners; if <1, results in Stochastic
                Gradient Boosting; reducing subsample reduces variance and increases bias
            n_estimators: number of decision trees
            criterion: either Friedman improved score ("friedman_mse"), variance reduction ("mse", mean squared error),
                or mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is 3
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            random_state: pseudo-random number generator seed
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            init: estimator for initial predictions; can be 'zero' for constant zero predictions
            validation_fraction: fraction of training data to set aside for early stopping; only with n_iter_no_change
            n_iter_no_change: set to integer to stop after no improvement (beyond tol) for that many rounds
            tol: tolerance for early stopping; only improvements larger than tol are considered

        The sklearn.GradientBoostingRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.GradientBoostingRegressor parameters.
        """

        super().__init__(**kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None})

        loss = params.enumeration(loss, {"ls", "lad", "huber", "quantile"})
        alpha = params.real(alpha, above=0, below=1)
        learning_rate = params.real(learning_rate, above=0, to=1)
        subsample = params.real(subsample, above=0, to=1)
        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion,
                                       {"friedman_mse", "mse", "mae"})
        max_depth = params.any_(max_depth,
                                lambda arg: params.integer(arg, from_=1),
                                params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf,
                                               from_=0.0,
                                               to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(max_leaf_nodes,
                                     lambda arg: params.integer(arg, from_=1),
                                     params.none)
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        random_state = params.any_(random_state, params.integer, params.none)
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        # no validation for init (no class signature validator)
        validation_fraction = params.real(validation_fraction,
                                          above=0,
                                          below=1)
        n_iter_no_change = params.any_(
            n_iter_no_change, lambda arg: params.integer(arg, from_=0),
            params.none)
        tol = params.real(tol, from_=0)

        self._model = skl.ensemble.GradientBoostingRegressor(
            loss=loss,
            alpha=alpha,
            learning_rate=learning_rate,
            subsample=subsample,
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            ccp_alpha=ccp_alpha,
            init=init,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )
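
A sketch (separate from the class) of what the "quantile" loss with alpha enables: fitting two models at alpha=0.05 and alpha=0.95 yields an approximate 90% prediction interval. Here sklearn is imported directly; the snippet's skl alias is assumed to refer to it:

import numpy as np
import sklearn.ensemble

rng = np.random.default_rng(0)
X = rng.random((200, 1))
y = X.ravel() + rng.normal(scale=0.1, size=200)

lower = sklearn.ensemble.GradientBoostingRegressor(loss="quantile", alpha=0.05).fit(X, y)
upper = sklearn.ensemble.GradientBoostingRegressor(loss="quantile", alpha=0.95).fit(X, y)
print(lower.predict(X[:3]))  # per-sample lower bounds
print(upper.predict(X[:3]))  # per-sample upper bounds
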