def __init__(
    self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
):
    """Initialize generalized function plot.

    Parameters:
        visualization_type: how to visualize generalized functions.
            Either a single value or a tuple of values (one per curve).
            Possible values: "points" (default), "box-whisker", "shaded-line"
        rectify: whether and by how much each curve's values are displaced
            horizontally to visually disentangle markers of different curves
            at the same location. True requests automatic displacement,
            False (default) requests none; a non-negative number is accepted
            as an explicit amount. If the horizontal axis scaling is
            logarithmic, the rectification factor is applied in log-space.

    Examples:
        # show three curves with automatic horizontal rectification
        __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
    """

    super().__init__(**kwargs)

    def single_type(arg):
        # one of the supported visualization names
        return params.enumeration(arg, {"points", "box-whisker", "shaded-line"})

    def several_types(arg):
        # tuple of visualization names; its arity can only be tested in evaluate()
        return params.tuple_(arg, single_type)

    def non_negative(arg):
        return params.real(arg, from_=0)

    self._visualization_type = params.any_(visualization_type, single_type, several_types)
    self._rectify = params.any_(rectify, non_negative, params.boolean)
def __init__(
    self,
    internal_hp_optimization: bool = True,
    kernel: Optional[Kernel] = None,
    alpha: Union[float, Sequence] = 1e-5,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=0,
    normalize_y=False,
    random_state: Optional[int] = None,
    **kwargs
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        internal_hp_optimization: if True, hyperparameters are optimized "internally"
            by the Gaussian process, that is, scikit-learn optimizes hyperparameters
            and for smlb the learner has no hyperparameters;
            if False, hyperparameters are optimized by smlb (and scikit-learn does
            not optimize any hyperparameters); currently only True is supported
        kernel: scikit-learn kernel; if None, the sum of an RBF kernel and a
            WhiteKernel is used as default
        alpha: regularization constant (scalar or vector); added as-is to kernel
            matrix diagonal. Equivalent to adding a "WhiteKernel"; the default is
            the corresponding value from scikit-learn's WhiteKernel, and different
            from scikit-learn's GaussianProcessRegressor.
        optimizer: hyperparameter optimization algorithm; used only if
            internal_hp_optimization is True
        n_restarts_optimizer: number of times optimizer is restarted; only used if
            internal_hp_optimization is True
        normalize_y: whether to subtract the mean of the labels
        random_state: integer seed, or None to leave seeding to scikit-learn

    Raises:
        NotImplementedError: if internal_hp_optimization is False.

    See skl.gaussian_process.GaussianProcessRegressor parameters.
    """

    super().__init__(**kwargs)

    internal_hp_optimization = params.boolean(internal_hp_optimization)
    kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel), params.none)
    # incomplete check for alpha as dimension becomes known only at fitting time
    alpha = params.any_(
        alpha,
        lambda arg: params.real(arg, from_=0),
        lambda arg: params.real_vector(arg, domain=[0, np.inf]),
    )
    # todo: check optimizer, requires params.union (of string and callable) and params.function
    normalize_y = params.boolean(normalize_y)
    # None is the declared default, so it must pass validation as well
    random_state = params.any_(random_state, params.integer, params.none)

    # explicit raise instead of assert: asserts are stripped under `python -O`
    if not internal_hp_optimization:
        raise NotImplementedError("external hyperparameter optimization is not yet supported")

    if kernel is None:
        kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

    self._model = skl.gaussian_process.GaussianProcessRegressor(
        kernel=kernel,
        alpha=alpha,
        optimizer=optimizer,
        n_restarts_optimizer=n_restarts_optimizer,
        normalize_y=normalize_y,
        random_state=random_state,
    )
def test_tuple_():
    """Tests tuple_ meta test."""

    def testf(arg):
        # accepts only None
        return params.none(arg)

    # special case: no tuple
    with pytest.raises(InvalidParameterError):
        params.tuple_(None, lambda arg: arg)

    # special case: single test
    assert params.tuple_((None,), testf) == (None,)
    with pytest.raises(InvalidParameterError):
        # exercise tuple_ itself (not any_) on a failing single entry
        params.tuple_(("_",), testf)

    # special case: 2-tuple
    assert params.tuple_((None, None), testf, testf) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", None), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, "_"), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", "_"), testf, testf)

    # arity parameter
    # compare the value; a bare truthiness check passes for any non-empty tuple
    assert params.tuple_((None, None), testf, arity=2) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None), testf, arity=3)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None, None), testf, arity=2)

    # default parameter
    assert params.tuple_((None,), testf, arity=3, default=None) == (None, None, None)

    # no arity, no default
    assert params.tuple_((None, None, None), testf) == (None, None, None)
def __init__(
    self,
    target=None,
    configuration: Optional[PlotConfiguration] = None,
    axes_labels=(None, None, None, None),
    axes_scales=("linear", "linear"),
    **kwargs,
):
    """Initialize Evaluation.

    Parameters:
        target: rendering target that evaluation outcome is rendered to;
            can be a single filename, or a matplotlib Axes or (Figure, Axes) pair,
            or a sequence thereof; if a matplotlib Axes or (Figure, Axes) pair,
            evaluation will add to it; if None, a new rendering target is created
        configuration: optional plot configuration controlling rendering details
        axes_labels: labels for all axes (bottom, left, top, right), None to not
            label an axis; for shorter tuples remaining entries are assumed None,
            so ('x', 'y') is valid
        axes_scales: scales ("linear" or "log") for horizontal and vertical axes

    Examples:
        __init__(axes_labels=("bottom", "left", "top"))  # right is None
        __init__(axes_scales=("log", "log"))
    """

    def plot_configuration(arg):
        return params.instance(arg, PlotConfiguration)

    configuration = params.any_(configuration, plot_configuration, params.none)
    super().__init__(configuration=configuration, **kwargs)

    # building blocks for target validation
    def axes(arg):
        return params.instance(arg, mpl.axes.Axes)

    def figure(arg):
        return params.instance(arg, mpl.figure.Figure)

    def figure_axes_pair(arg):
        return params.tuple_(arg, figure, axes, arity=2)

    def single_target(arg):
        # Axes, (Figure, Axes), or filename
        return params.any_(arg, axes, figure_axes_pair, params.string)

    def target_sequence(arg):
        # sequence of single targets (None is not allowed inside)
        return params.tuple_(arg, single_target)

    def label(arg):
        return params.any_(arg, params.string, params.none)

    def scale(arg):
        return params.enumeration(arg, {"linear", "log"})

    self._target = params.any_(target, single_target, params.none, target_sequence)
    self._axes_labels = params.tuple_(axes_labels, label, arity=4, default=None)
    self._axes_scales = params.tuple_(axes_scales, scale, arity=2)

    self._figaxis = None
def __init__(
    self, labels_to_load: Optional[Union[str, List[str]]] = None, ignore_dubious: bool = False
):
    """Initialize Ni-superalloy dataset with specified labels.

    Parameters:
        labels_to_load (str or List[str]): which labels to load. Options are
            'Yield Strength', 'Ultimate Tensile Strength', 'Stress Rupture Time',
            'Stress Rupture Stress', and 'Elongation'.
            If None, then all labels are loaded.
        ignore_dubious: whether or not to ignore samples that have something
            questionable about them
    """

    def string_sequence(arg):
        return params.sequence(arg, type_=str)

    def label_spec(arg):
        # a single label name or a sequence of label names
        return params.any_(arg, params.string, string_sequence)

    labels_to_load = params.optional_(labels_to_load, label_spec)
    ignore_dubious = params.boolean(ignore_dubious)

    data, labels = self._load_data_and_labels(
        self.DEFAULT_PATH, labels_to_load, ignore_dubious
    )
    super().__init__(data=data, labels=labels)
def __init__(self, class_path: Optional[str] = None):
    """Initialize Java gateway.

    If derived class is initialized for the first time,
    start up JVM and create gateway. On subsequent initializations
    of derived class, the same gateway is used, except when a
    different class_path is passed. In that case, the JVM is shut
    down and restarted with the new class path.

    Parameters:
        class_path: local filesystem class path containing one or more
            directories or .jar files. If not specified, an empty string
            is passed as classpath to the JVM.

    Raises:
        BenchmarkError if the class_path is invalid.
    """

    # todo: class_path = params.optional_(class_path, params.string)
    class_path = params.any_(class_path, params.string, params.none)

    owner = self.__class__  # gateway state is kept on the class, not the instance

    if owner._gateway is None:
        # first instantiation of the derived class: create the gateway
        self._launch_gateway(class_path=class_path)
        return

    if owner._class_path != class_path:
        # class path changed: restart the JVM with the new one
        self._shutdown_gateway()
        self._launch_gateway(class_path=class_path)

    # otherwise the existing gateway is reused as-is
def axes_labels(self, labels=(None, None, None, None), **kwargs):
    """Set axes labels.

    Parameters:
        labels: labels for bottom, left, top, right axes;
            None indicates to use the current value
        kwargs: forwarded to the matplotlib label setters

    Raises:
        NotImplementedError: if a top or right label is specified

    Examples:
        axes_labels = (None, "y")  # set only left axis label
    """

    def label_or_none(arg):
        return params.any_(arg, params.string, params.none)

    labels = params.tuple_(labels, label_or_none, arity=4, default=None)

    # re-assign tuple as a whole; unspecified entries keep the current value
    # NOTE(review): reads self.axes_labels but writes self._labels; confirm this pairing
    merged = []
    for position in range(4):
        given = labels[position]
        merged.append(given if given is not None else self.axes_labels[position])
    self._labels = tuple(merged)

    # set labels if specified (not None)
    # this allows to pass kwargs specific to one axis
    bottom, left = labels[0], labels[1]
    if bottom is not None:
        self.ax.set_xlabel(bottom, fontdict=self._fontdict(), **kwargs)
    if left is not None:
        self.ax.set_ylabel(left, fontdict=self._fontdict(), **kwargs)
    if not (labels[2] is None and labels[3] is None):
        # todo; possible implementation via xtwin/ytwin, storing these axes in outcome
        raise NotImplementedError
def axes_scales(self, scales=(None, None), **kwargs):
    """Set axes scales.

    Parameters:
        scales: scales (None, "linear" or "log") for horizontal and vertical axes;
            None indicates to use the current value
        kwargs: forwarded to the matplotlib scale setters

    Examples:
        axes_scales = (None, "log")  # change only vertical axis
    """

    def known_scale(arg):
        return params.enumeration(arg, {"linear", "log"})

    def scale_or_none(arg):
        return params.any_(arg, known_scale, params.none)

    scales = params.tuple_(scales, scale_or_none, arity=2, default=None)
    horizontal, vertical = scales[0], scales[1]

    # re-assign tuple as a whole; unspecified entries keep the current value
    # NOTE(review): reads self.axes_scales but writes self._scales; confirm this pairing
    self._scales = (
        horizontal if horizontal is not None else self.axes_scales[0],
        vertical if vertical is not None else self.axes_scales[1],
    )

    # set axes if specified (not None)
    # this allows to pass kwargs specific to one axis
    if horizontal is not None:
        self.ax.set_xscale(horizontal, **kwargs)
    if vertical is not None:
        self.ax.set_yscale(vertical, **kwargs)
def __init__(
    self,
    source: str,
    join: Optional[Union[str, bool]] = None,
    **kwargs,
):
    """Loads dataset.

    All `IndexedFiniteLabeledDataPandasBackend.__init__` keyword arguments
    can be passed, in particular join, filterf, samplef, and labelf.
    See there for further explanation.

    Parameters:
        source: path to underlying data file (see class docstring);
            accepts both .csv and .csv.zip versions
        join: whether to join entries with the same chemical sum formula;
            this changes labels from single numbers to varying-length
            sequences of numbers. True can be passed to join by stoichiometry
            (it is translated to the "formula" key); False is treated like None.
        filterf: a function that accepts a sample and returns whether to
            keep it (True) or exclude it (False). Default retains all samples
        samplef: function accepting and returning a sample;
            applied to all samples as post-processing
        labelf: function accepting and returning a label;
            applied to all labels as post-processing

    All samples have these keys:
        id: unique identifier (integer)
        SMILES: SMILES encoding
        formula: stoichiometric formula

    All labels have these keys:
        mass: weight of molecule
        PCE: power conversion efficiency
        VOC: open circuit voltage
        JSC: short-circuit current density
        HOMO: highest occupied molecular orbital
        gap: LUMO-HOMO
        LUMO: lowest unoccupied molecular orbital

    The identifiers and SMILES strings are unique.
    Stoichiometries are not (10,474 unique ones).

    Raises:
        InvalidParameterError: on invalid parameter values
    """

    join = params.any_(join, params.string, params.boolean, params.none)

    # translate boolean settings for join into the base class convention
    if join is True:
        join = "formula"
    elif join is False:
        join = None

    data, labels = self._load_data(source)

    super().__init__(data=data, labels=labels, join=join, **kwargs)
def _indices_testf(self, indices: Sequence[Any]):
    """Validate optional sample indices.

    Delegates to params.optional_, so what happens for None is decided there.
    Otherwise indices must be either an empty tuple or a tuple of integers in
    [0, self.num_samples); the validated result is converted to a list.
    """

    def in_range(arg):
        return params.integer(arg, from_=0, below=self.num_samples)

    def empty(arg):
        # empty set
        return params.tuple_(arg, None, arity=0)

    def non_empty(arg):
        return params.tuple_(arg, in_range)

    def as_index_list(arg):
        # NumPy indexing expects a list
        return list(params.any_(arg, empty, non_empty))

    return params.optional_(indices, as_index_list)
def __init__(
    self,
    rng: int = None,
    num_seeds: int = 1,
    resolution: int = 64,
    max_relative_jump: float = 1.0,
    dimensions_varied: Union[str, float, int] = "all",
    max_iters: Optional[int] = None,
    max_evals: Optional[int] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        rng: pseudo-random number generator seed
        num_seeds: the number of starting points, and the number of points
            chosen at the end of each iteration
        resolution: the number of points to sample along a single dimension
            for a single seed
        max_relative_jump: the maximum relative step size along a single dimension.
            If a given dimension has length `L` and a seed has value `x` along that
            dimension, then the candidates are `resolution` linearly spaced points
            from the range [x - max_relative_jump * L, x + max_relative_jump * L]
            (clipped by the bounds). `max_relative_jump` must be on (0, 1].
            For a value of 1, the entire range is always considered.
        dimensions_varied: how many randomly selected dimensions to explore with
            each step. 'all' indicates all dimensions. An integer directly specifies
            the number of dimensions. A float on (0, 1) indicates the fractional
            number of the total.
        max_iters: the maximum number of iterations
        max_evals: the maximum number of function evaluations (this is a soft
            maximum: once it is reached then the current iteration finishes)

    Raises:
        InvalidParameterError: if neither max_iters nor max_evals is given

    TODO: add tolerance stopping conditions
    """

    super().__init__(rng=rng, **kwargs)

    def positive_count(arg):
        return params.integer(arg, from_=1)

    def dimension_count(arg):
        return params.integer(arg, above=0)

    def dimension_fraction(arg):
        return params.real(arg, above=0.0, below=1.0)

    def all_dimensions(arg):
        return params.enumeration(arg, {"all"})

    self._num_seeds = params.integer(num_seeds, from_=1)
    self._resolution = params.integer(resolution, from_=2)
    self._max_relative_jump = params.real(max_relative_jump, above=0.0, to=1.0)
    self._dimensions_varied = params.any_(
        dimensions_varied, dimension_count, dimension_fraction, all_dimensions
    )
    self._max_iters = params.optional_(max_iters, positive_count)
    self._max_evals = params.optional_(max_evals, positive_count)

    # without any limit the search would never terminate
    limits = (self._max_iters, self._max_evals)
    if all(limit is None for limit in limits):
        raise InvalidParameterError("at least one stopping condition defined", "all Nones")
def __init__(self, configuration: Optional[EvaluationConfiguration] = None, **kwargs):
    """Initialize Evaluation.

    Parameters:
        configuration: optional configuration object controlling rendering details;
            if None, the result of self._default_configuration() is used
    """

    super().__init__(**kwargs)

    def evaluation_configuration(arg):
        return params.instance(arg, EvaluationConfiguration)

    self._configuration = params.any_(configuration, evaluation_configuration, params.none)
    if self._configuration is None:
        # fall back to the default only after the attribute exists
        self._configuration = self._default_configuration()

    # internal handle on optional auxiliary outcome data
    self._auxiliary = {}
def __init__(self, noise_part=None, signal_part=None, **kwargs):
    """Initialize decompositions.

    Parameters:
        noise_part: estimated noise distribution; the aleatoric component
        signal_part: estimated signal distribution; the epistemic component

    Both parts must be PredictiveDistribution instances or None.
    """

    super().__init__(**kwargs)

    def distribution_or_none(arg):
        return params.any_(
            arg, lambda dist: params.instance(dist, PredictiveDistribution), params.none
        )

    self._noise_part = distribution_or_none(noise_part)
    self._signal_part = distribution_or_none(signal_part)
def __init__(
    self, input_: TabularData, output: PredictiveDistribution, scores: Sequence[float], **kwargs
):
    """Initialize state from inputs, predicted outputs and scores.

    Parameters:
        input_: evaluated samples; must be a TabularData instance with at least one sample
        output: predictions; must be a PredictiveDistribution instance
        scores: sequence of floats, either of length 1
            or with one entry per sample in input_
    """

    super().__init__(**kwargs)

    self._input: TabularData = params.instance(input_, TabularData)
    self._output: PredictiveDistribution = params.instance(output, PredictiveDistribution)

    # total number of function evaluations during this step
    self._num_evaluations: int = params.integer(self._input.num_samples, from_=1)

    def float_sequence(length):
        # validator for a float sequence of exactly the given length
        return lambda arg: params.sequence(arg, length=length, type_=float)

    self._scores: Sequence[float] = params.any_(
        scores, float_sequence(1), float_sequence(self._num_evaluations)
    )
def noise(self, shape=None):
    """Return Gaussian noise of the requested shape.

    Values are drawn via self.random.normal using self._mean and self._stddev.

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing
        independent identically distributed Gaussian noise

    Raises:
        InvalidParameterError: for invalid parameters
    """

    # NOTE(review): the default shape=None presumably fails validation, since
    # neither alternative below accepts None; confirm whether that is intended

    def positive_int(arg):
        return params.integer(arg, from_=1)

    def positive_int_tuple(arg):
        return params.tuple_(arg, positive_int)

    # valid shapes are either a positive integer or a tuple of positive integers
    shape = params.any_(shape, positive_int, positive_int_tuple)

    return self.random.normal(self._mean, self._stddev, size=shape)
def __init__(self, cdk_jar_path: Optional[str] = None):
    """Initialize CDK Java gateway.

    See base class JavaGateway for details.
    This class provides CDK-specific functionality,
    namely the path to the CDK .jar file.

    Parameters:
        cdk_jar_path: local filesystem path to the CDK jar, e.g.,
            '/file/path/cdk.jar'. If not specified, smlb tries to find the CDK jar.

    Raises:
        BenchmarkError if the CDK .jar file can not be found.
    """

    # todo: optional_
    # cdk_jar_path = params.optional_(cdk_jar_path, params.string) todo: valid path
    cdk_jar_path = params.any_(cdk_jar_path, params.string, params.none)

    if cdk_jar_path is None:
        # prefer a previously detected path, if any
        # NOTE(review): nothing in this method stores a detected path in
        # _cdk_jar_path_auto; confirm it is populated elsewhere
        cdk_jar_path = self._cdk_jar_path_auto

    if cdk_jar_path is None:
        # attempt to find CDK .jar file
        # todo: find correct path for installed versions
        here = os.path.dirname(__file__)
        path = os.path.join(here, "../build/cdk.jar")
        readable = os.access(path, os.R_OK)
        if not readable:
            raise BenchmarkError(
                "Valid path to .jar file",
                path,
                explanation=f"Jar file {path} does not exist or is not readable.",
            )
        cdk_jar_path = path

    super().__init__(cdk_jar_path)
def noise(self, shape=None):
    """Return no noise.

    A constant value (self._value) is returned.

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing a constant value

    Raises:
        InvalidParameterError: for invalid parameters
    """

    # NOTE(review): the default shape=None presumably fails validation, since
    # neither alternative below accepts None; confirm whether that is intended

    def positive_int(arg):
        return params.integer(arg, from_=1)

    def positive_int_tuple(arg):
        return params.tuple_(arg, positive_int)

    # valid shapes are either a positive integer or a tuple of positive integers
    shape = params.any_(shape, positive_int, positive_int_tuple)

    return np.full(shape, self._value)
def test_all_():
    """Tests all_ meta test."""

    def is_none(arg):
        return params.none(arg)

    def above_one(arg):
        return params.integer(arg, above=1)

    def from_two(arg):
        return params.integer(arg, from_=2)

    # special case: single test
    # exercise all_ itself (not any_) for the passing single-test case
    assert params.all_(None, is_none) is None
    with pytest.raises(InvalidParameterError):
        params.all_("_", is_none)

    # special case: and
    assert params.all_(2, above_one, from_two) == 2
    assert params.all_(3, above_one, from_two, lambda arg: params.integer(arg, from_=3)) == 3

    # fail in first testf
    with pytest.raises(InvalidParameterError):
        params.all_(1, above_one, from_two)

    # fail in last testf
    with pytest.raises(InvalidParameterError):
        params.all_(2, above_one, from_two, lambda arg: params.integer(arg, above=2))
def __init__(
    self,
    fits: bool = True,
    fit_lambda: float = 1e-7,
    fit_weights: Optional[str] = None,
    base=10,
    **kwargs,
):
    """Initialize learning curve plot.

    Parameters:
        fits: if True, show estimated asymptotic fits
        fit_lambda: regularization strength for asymptotic fits; defaults to 1e-7
        fit_weights: if and how to weight fits; one of
            None: no weighting, "variance": weigh by variance for each training set size
        base: base for logarithmic plotting; must be at least 2

    All parameters from base classes, in particular GeneralizedFunctionPlot and Plot.
    Unless given explicitly, axes_scales defaults to ("log", "log") and axes_labels
    to ("training set size", "evaluation metric", None, None).
    """

    # set learning curve-specific arguments if not explicitly set
    kwargs.setdefault("axes_scales", ("log", "log"))
    kwargs.setdefault("axes_labels", ("training set size", "evaluation metric", None, None))

    super().__init__(**kwargs)

    def variance_weighting(arg):
        return params.enumeration(arg, {"variance"})

    # parameters
    self._fits = params.boolean(fits)
    self._fit_lambda = params.real(fit_lambda, from_=0)
    self._fit_weights = params.any_(fit_weights, variance_weighting, params.none)
    self._base = params.real(base, from_=2)

    # logarithm and power with respect to the chosen base
    self._logf = lambda x: np.log(x) / np.log(self._base)
    self._powf = lambda x: np.power(self._base, x)
def __init__(
    self,
    data: "pandas.DataFrame",  # noqa F821
    labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
    dtype: Optional[dict] = None,
    join: Optional[str] = None,
    filterf: Optional[Callable[[Any], bool]] = None,
    samplef: Optional[Callable[[Any], Any]] = None,
    labelf: Optional[Callable[[Any], Any]] = None,
    **kwargs,
):
    """Initialize dataset.

    Parameters control loading and preprocessing of the data. Order:
    1. joining
    2. filtering
    3. sample and label transform

    Parameters:
        data: the samples in the form of a Pandas DataFrame.
        labels: the labels, either in the form of a Pandas DataFrame with same
            number of rows as data and different column names, or in the form of
            a list of column names, which are then split out from the data and
            used as labels. If not specified, the dataset is unlabeled.
        dtype: the NumPy data types to use for samples and labels, in the form of
            a dictionary with column names as keys and dtypes as values. Can be
            used to override dtype auto-detection for some or all columns.
        join: if specified, name of "column" to join by; this changes labels
            to be sequences of single-entry labels
        filterf: a function that accepts a sample and returns whether to
            keep it (True) or exclude it (False). Default retains all samples
        samplef: function accepting and returning a sample; applied to all
            samples as post-processing
        labelf: function accepting and returning a label; applied to all
            labels as post-processing
        kwargs: passed on to the base class initializer

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of
        data and labels must match. If column names are given, they must be
        unique across data and labels, if any. Also raised if labelf is given
        for an unlabeled dataset.
    """

    import pandas as pd  # only import if class is used

    # parameter validation
    data = params.instance(data, pd.DataFrame)
    labels = params.optional_(
        labels,
        lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
            lambda arg: params.tuple_(arg, params.string),
        ),
    )
    dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
    join = params.optional_(join, params.string)
    # filterf, samplef and labelf must be callable with a single argument
    singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
    filterf = params.optional_(filterf, singleargf)
    samplef = params.optional_(samplef, singleargf)
    labelf = params.optional_(labelf, singleargf)
    # a label transform makes no sense without labels
    if labels is None and labelf:
        raise InvalidParameterError(
            "matching labels and label function",
            "label function specified for unlabeled data")

    # process data
    # drop the incoming index so that rows are numbered 0..n-1
    data = data.reset_index(drop=True)

    # if labels are given as separate DataFrame, join them
    if isinstance(labels, pd.DataFrame):
        if len(data) != len(labels):
            raise InvalidParameterError(
                "matching data and labels",
                f"different number of rows ({len(data)} != {len(labels)})",
            )

        labels = labels.reset_index(drop=True)

        # column names must be unique across samples and labels
        col_names = np.hstack((data.columns, labels.columns))
        if len(col_names) != len(pd.unique(col_names)):
            raise InvalidParameterError(
                "unique column names",
                f"{data.columns.values} and {labels.columns.values}")

        # from here on, labels is the array of label column names within data
        data = pd.concat([data, labels], axis=1)
        labels = labels.columns.values

    # 1. optional joining
    if join:
        # group rows by the join column; all other columns become lists per group
        groups = data.groupby(join, sort=False, as_index=False)
        data = groups.aggregate(lambda tdf: tdf.tolist())

    # 2. optional filtering
    if filterf:
        # filterf is applied row-wise (label columns are still part of data here)
        selection = data.apply(filterf, axis=1)
        data = data[selection]

    # split data and labels
    if labels is not None:
        # DataFrame column indexing requires list, not tuple
        data, labels = data.drop(columns=list(labels)), data[list(labels)]

    # 3. optional sample and label transform
    if samplef:
        data = data.apply(samplef, axis=1, result_type="reduce")
        # a transform can yield a Series; wrap it in a single-column DataFrame
        if isinstance(data, pd.Series):
            data = pd.DataFrame(data, columns=["Samples"])
    if labelf:
        labels = labels.apply(labelf, axis=1, result_type="reduce")
        # same wrapping as for samples
        if isinstance(labels, pd.Series):
            labels = pd.DataFrame(labels, columns=["Labels"])

    # convert to NumPy structured array
    data = self._to_numpy(data, dtype=dtype)
    labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

    super().__init__(data=data, labels=labels, **kwargs)
def __init__(
    self,
    uncertainties: Optional[str] = None,
    loss: str = "ls",
    alpha: float = 0.9,
    learning_rate: float = 0.1,
    subsample: float = 1.0,
    n_estimators: int = 100,
    criterion: str = "mse",
    max_depth: int = 3,
    min_samples_split: Union[int, float] = 2,
    min_samples_leaf: Union[int, float] = 1,
    min_weight_fraction_leaf: float = 0.0,
    max_features: Union[int, float, str, None] = None,
    max_leaf_nodes: Optional[int] = None,
    min_impurity_decrease: float = 0.0,
    # min_impurity_split deprecated
    random_state: Optional[int] = None,
    ccp_alpha: float = 0.0,
    init: Optional[Any] = None,
    validation_fraction: float = 0.1,
    n_iter_no_change: Optional[int] = None,
    tol: float = 0.0001,
    **kwargs,
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        uncertainties: whether and how to compute predictive uncertainties;
            the only accepted choice is None (no predictive uncertainties)
        loss: loss function to optimize; valid values are "ls" (least squares),
            "lad" (least absolute deviation), "huber" (Huber's loss),
            "quantile" (quantile regression). Use alpha parameter for huber and quantile.
        alpha: quantile for "huber" and "quantile" loss functions
        learning_rate: value by which to shrink contribution of consecutive trees;
            trade-off with n_estimators
        subsample: fraction of samples for fitting base learners; if <1 results in
            Stochastic Gradient Boosting. reducing subsample reduces variance and
            increases bias.
        n_estimators: number of decision trees
        criterion: either Friedman improved score ("friedman_rmse"),
            variance reduction ("mse", mean squared error),
            or, mean absolute error ("mae")
        max_depth: maximum depth of a tree; default is 3
        min_samples_split: minimum number of samples required to split an internal node;
            float numbers indicate a fraction of number of training samples
        min_samples_leaf: minimum number of training samples required in a leaf node
            float numbers indicate a fraction of number of training samples
        min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
        max_features: number of features considered when splitting; integers directly
            specify the number, floating point values specify which fraction of all
            features to use; "auto" uses all features, "sqrt" and "log2" use square
            root and binary logarithm of number of features
        max_leaf_nodes: maximum number of leaves a tree can have
        min_impurity_decrease: minimum impurity decrease required for splitting
        random_state: pseudo-random number generator seed,
            or None to leave seeding to scikit-learn
        ccp_alpha: complexity parameter for minimal cost-complexity pruning.
        init: estimator for initial predictions; can be 'zero' for constant zero
            predictions; passed through without validation
        validation_fraction: fraction of training data to set aside for early stopping;
            only with n_iter_no_change
        n_iter_no_change: set to integer to stop after no improvement (beyond tol)
            for that many rounds
        tol: tolerance for early stopping; only improvements larger than tol are considered

    The sklearn.GradientBoostingRegressor parameters `oob_score`, `verbose`,
    `warm_start` are not considered.

    See skl.ensemble.GradientBoostingRegressor parameters.
    """

    super().__init__(**kwargs)

    # shared validators
    def count_from_one(arg):
        return params.integer(arg, from_=1)

    def fraction(arg):
        return params.real(arg, above=0.0, to=1.0)

    # validate parameters

    self._uncertainties = params.enumeration(uncertainties, {None})

    loss = params.enumeration(loss, {"ls", "lad", "huber", "quantile"})
    alpha = params.real(alpha, above=0, below=1)
    learning_rate = params.real(learning_rate, above=0, to=1)
    subsample = params.real(subsample, above=0, to=1)
    n_estimators = params.integer(n_estimators, from_=1)
    criterion = params.enumeration(criterion, {"friedman_rmse", "mse", "mae"})
    max_depth = params.any_(max_depth, count_from_one, params.none)
    min_samples_split = params.any_(
        min_samples_split, lambda arg: params.integer(arg, from_=2), fraction
    )
    min_samples_leaf = params.any_(min_samples_leaf, count_from_one, fraction)
    min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
    max_features = params.any_(
        max_features,
        lambda arg: params.integer(arg, above=0),
        fraction,
        lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
        params.none,
    )
    max_leaf_nodes = params.any_(max_leaf_nodes, count_from_one, params.none)
    min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
    # None is the declared default, so it must pass validation as well
    random_state = params.any_(random_state, params.integer, params.none)
    ccp_alpha = params.real(ccp_alpha, from_=0.0)
    # no validation for init (no class signature validator)
    validation_fraction = params.real(validation_fraction, above=0, below=1)
    n_iter_no_change = params.any_(
        n_iter_no_change, lambda arg: params.integer(arg, from_=0), params.none
    )
    tol = params.real(tol, from_=0)

    self._model = skl.ensemble.GradientBoostingRegressor(
        loss=loss,
        alpha=alpha,
        learning_rate=learning_rate,
        subsample=subsample,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state,
        ccp_alpha=ccp_alpha,
        init=init,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol,
    )
def __init__(
    self,
    num_trees: int = -1,
    use_jackknife: bool = True,
    bias_learner: Optional[BaseLoloLearner] = None,
    leaf_learner: Optional[BaseLoloLearner] = None,
    subset_strategy: Union[str, int, float] = "auto",
    min_leaf_instances: int = 1,
    max_depth: int = 2 ** 30,
    uncertainty_calibration: bool = False,
    randomize_pivot_location: bool = False,
    # randomly_rotate_features: bool = False, currently in develop branch
    **kwargs
):
    """Initialize random forest model.

    See lolo Scala source code for initialization parameters:
    https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

    When using `uncertainty_calibration=False` (the default), the number of trees
    `num_trees` should be set to a multiple of the number n of training samples,
    `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
    `num_trees = 64` is sufficient.

    Parameters:
        num_trees: number of trees in the forest; -1 uses number of training samples
        use_jackknife: whether to use jackknife-based variance estimates
        bias_learner: algorithm used to model bias
        leaf_learner: algorithm used at each leaf of the random forest
        subset_strategy: strategy to determine number of features used at each split
            "auto": use the default for lolo (all features for regression, sqrt for classification)
            "log2": use the base 2 log of the number of features
            "sqrt": use the square root of the number of features
            integer: set the number of features explicitly
            float: use a certain fraction of the features
        min_leaf_instances: minimum number of instances at each leaf
        max_depth: maximum depth of decision trees
        uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
            based on out-of-bag residuals
        randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
        randomly_rotate_features: whether to rotate real scalar features for each tree
            (currently disabled in the signature)

    Raises:
        BenchmarkError: if instantiating the lolo model fails
    """

    super().__init__(**kwargs)

    # shared validators
    def positive_int(arg):
        return params.integer(arg, above=0)

    def minus_one(arg):
        return params.integer(arg, from_=-1, to=-1)

    def lolo_learner(arg):
        return params.instance(arg, BaseLoloLearner)

    def named_strategy(arg):
        return params.enumeration(arg, {"auto", "log2", "sqrt"})

    def positive_real(arg):
        return params.real(arg, above=0)

    # validate parameters
    num_trees = params.any_(num_trees, positive_int, minus_one)
    use_jackknife = params.boolean(use_jackknife)
    bias_learner = params.any_(bias_learner, lolo_learner, params.none)
    leaf_learner = params.any_(leaf_learner, lolo_learner, params.none)
    subset_strategy = params.any_(subset_strategy, named_strategy, positive_int, positive_real)
    min_leaf_instances = params.integer(min_leaf_instances, above=0)
    # the default 2**30 works for 32 bit or larger architectures
    max_depth = params.integer(max_depth, above=0)
    uncertainty_calibration = params.boolean(uncertainty_calibration)
    randomize_pivot_location = params.boolean(randomize_pivot_location)
    # randomly_rotate_features = params.boolean(randomly_rotate_features)

    # set up model
    model_arguments = dict(
        num_trees=num_trees,
        use_jackknife=use_jackknife,
        bias_learner=bias_learner,
        leaf_learner=leaf_learner,
        subset_strategy=subset_strategy,
        min_leaf_instances=min_leaf_instances,
        max_depth=max_depth,
        uncertainty_calibration=uncertainty_calibration,
        randomize_pivot_location=randomize_pivot_location,
        # randomly_rotate_features=randomly_rotate_features,
    )
    try:
        self._model = RandomForestRegressor(**model_arguments)
    except Py4JJavaError as e:
        raise BenchmarkError("instantiating lolo model failed") from e

    # otherwise, deviations will be zero
    self._with_uncertainties = use_jackknife
def __init__(
    self,
    rng: int = None,
    strategy: str = "best1bin",
    maxiter: int = 1000,
    popsize: int = 15,
    tol: float = 0.01,
    mutation=(0.5, 1),
    recombination: float = 0.7,
    **kwargs
):
    """Initialize state.

    Scipy-specific parameters are passed through.

    Parameters:
        rng: integer seed, handed to the base class initializer. Will be used
            to generate a new seed each time the optimizer is run.
        strategy: The differential evolution strategy to use; one of the
            "<base><crossover>" names accepted below, e.g. "best1bin".
            See documentation for complete list and explanations.
        maxiter: The maximum number of generations over which the entire
            population is evolved. An integer of at least 1.
        popsize: A multiplier for setting the total population size.
            An integer of at least 1.
        tol: Relative tolerance for convergence. Must be positive.
        mutation: The mutation constant. Either a number between 0 and 2 or
            a tuple (min, max), with min <= max <= 2, in which case the mutation
            constant is randomly selected uniformly from between min and max
            with each generation.
        recombination: The recombination constant. Must be between 0 and 1.
        **kwargs: passed on to the base class initializer
    """

    super().__init__(rng=rng, **kwargs)

    known_strategies = {
        "best1bin",
        "best1exp",
        "rand1exp",
        "randtobest1exp",
        "currenttobest1exp",
        "best2exp",
        "rand2exp",
        "randtobest1bin",
        "currenttobest1bin",
        "best2bin",
        "rand2bin",
        "rand1bin",
    }

    def mutation_constant(value, low=0.0):
        """Validate a single mutation constant within [low, 2]."""
        return params.real(value, from_=low, to=2.0)

    def mutation_interval(bounds):
        """Validate a (min, max) pair; the upper bound may not undercut the lower one."""
        return params.tuple_(
            bounds,
            mutation_constant,
            lambda upper: mutation_constant(upper, low=bounds[0]),
            arity=2,
        )

    self._strategy = params.enumeration(strategy, known_strategies)
    self._maxiter = params.integer(maxiter, from_=1)
    self._popsize = params.integer(popsize, from_=1)
    self._tol = params.real(tol, above=0.0)
    self._mutation = params.any_(mutation, mutation_constant, mutation_interval)
    self._recombination = params.real(recombination, from_=0.0, to=1.0)
def __init__(
    self,
    rng: int = None,
    uncertainties: Optional[str] = None,
    n_estimators: int = 100,
    criterion: str = "mse",
    max_depth: Optional[int] = None,
    min_samples_split: Union[int, float] = 2,
    min_samples_leaf: Union[int, float] = 1,
    min_weight_fraction_leaf: float = 0.0,
    max_features: Union[int, float, str, None] = "auto",
    max_leaf_nodes: Optional[int] = None,
    min_impurity_decrease: float = 0.0,
    # min_impurity_split deprecated
    bootstrap: bool = True,
    n_jobs: Optional[int] = None,
    ccp_alpha: float = 0.0,
    max_samples: Optional[Union[int, float]] = None,
    **kwargs,
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        rng: integer seed; passed on to the base class initializer
        uncertainties: whether and how to compute predictive uncertainties; choices are
            None; by default, ExtraTreesRegressor does not return predictive uncertainties;
            "naive"; uses the ensembles standard deviation
        n_estimators: number of decision trees
        criterion: either variance reduction ("mse", mean squared error),
            or, mean absolute error ("mae")
        max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
        min_samples_split: minimum number of samples required to split an internal node;
            float numbers indicate a fraction of number of training samples
        min_samples_leaf: minimum number of training samples required in a leaf node
            float numbers indicate a fraction of number of training samples
        min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
        max_features: number of features considered when splitting; integers directly specify
            the number, floating point values specify which fraction of all features to use;
            "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm
            of number of features
        max_leaf_nodes: maximum number of leaves a tree can have
        min_impurity_decrease: minimum impurity decrease required for splitting
        bootstrap: if False, the whole dataset is used to build trees
        n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
        ccp_alpha: complexity parameter for minimal cost-complexity pruning.
        max_samples: number of input samples to draw during bootstrap; integers directly
            specify the number, floating point values specify which fraction of samples
            to use (must be positive and at most 1); all by default
        **kwargs: passed on to the base class initializer

    The sklearn.ExtraTreesRegressor parameters `oob_score`, `verbose`, `warm_start`
    are not considered.

    See skl.ensemble.ExtraTreesRegressor parameters.
    """

    super().__init__(rng=rng, **kwargs)

    # validate parameters

    self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

    n_estimators = params.integer(n_estimators, from_=1)
    criterion = params.enumeration(criterion, {"mse", "mae"})
    max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
    min_samples_split = params.any_(
        min_samples_split,
        lambda arg: params.integer(arg, from_=2),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_samples_leaf = params.any_(
        min_samples_leaf,
        lambda arg: params.integer(arg, from_=1),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
    max_features = params.any_(
        max_features,
        lambda arg: params.integer(arg, above=0),
        lambda arg: params.real(arg, above=0.0, to=1.0),
        lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
        params.none,
    )
    max_leaf_nodes = params.any_(
        max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
    )
    min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
    bootstrap = params.boolean(bootstrap)
    n_jobs = params.any_(
        n_jobs,
        lambda arg: params.integer(arg, from_=-1, to=-1),
        lambda arg: params.integer(arg, from_=1),
        params.none,
    )
    ccp_alpha = params.real(ccp_alpha, from_=0.0)
    max_samples = params.any_(
        max_samples,
        lambda arg: params.integer(arg, from_=1),
        # a fraction of zero is rejected by scikit-learn, but only when fitting;
        # exclude it here, consistent with the other fraction-valued parameters
        lambda arg: params.real(arg, above=0.0, to=1.0),
        params.none,
    )

    self._model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        n_jobs=n_jobs,
        ccp_alpha=ccp_alpha,
        max_samples=max_samples,
    )
def test_any_():
    """Tests any_ meta test."""

    def accept(arg):
        # passes whenever any_ hands over None
        return params.none(arg)

    def reject(arg):
        # ignores its argument and always fails validation
        return params.none("_")

    outcomes = (accept, reject)

    # special case: single test
    assert params.any_(None, accept) is None
    with pytest.raises(InvalidParameterError):
        params.any_("_", accept)

    # special case: or
    # any_ succeeds as soon as one of the two tests accepts
    for second in outcomes:
        for first in outcomes:
            if accept in (first, second):
                assert params.any_(None, first, second) is None
            else:
                with pytest.raises(InvalidParameterError):
                    params.any_(None, first, second)

    # three tests
    # all eight combinations; only the all-rejecting one may raise
    for first in outcomes:
        for second in outcomes:
            for third in outcomes:
                tests = (first, second, third)
                if accept in tests:
                    assert params.any_(None, *tests) is None
                else:
                    with pytest.raises(InvalidParameterError):
                        params.any_(None, *tests)