Example #1
    def __init__(self,
                 data: np.ndarray,
                 labels: Optional[np.ndarray] = None,
                 **kwargs):
        """Initialize dataset.

        Parameters:
            data: tabular data as a NumPy ndarray
            labels: labels as a NumPy ndarray. If not specified,
                the dataset is unlabeled.

        Raises:
            InvalidParameterError for invalid arguments. In particular,
                the numbers of samples and labels must match.

        Examples:
            From numerical NumPy data:
            ```
            TabularData(numpy.ndarray(...), ...)
            ```

            From a Pandas DataFrame:
            ```
            df = pandas.DataFrame(..., columns=[...])
            TabularData(df.to_records(index=False), labels=...)
            ```

            From mixed NumPy data, with column names (note use of tuples):
            ```
            a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', 'U1'), ('D', int)])
            TabularData(a, ...)
            ```
        """

        # parameter validation
        data = params.instance(data, np.ndarray)
        labels = params.optional_(labels,
                                  lambda arg: params.instance(arg, np.ndarray))

        if labels is not None:
            # number of samples and labels must match
            if data.shape[0] != labels.shape[0]:
                raise InvalidParameterError(
                    "same number of samples and labels",
                    f"{data.shape[0]} samples, {labels.shape[0]} labels",
                )

            # uniqueness of "column" names, if any, is enforced by NumPy,
            # but only separately for data and labels
            if is_sequence(data.dtype.names) and is_sequence(
                    labels.dtype.names):
                column_names = data.dtype.names + labels.dtype.names
                if len(column_names) != len(np.unique(column_names)):
                    raise InvalidParameterError(
                        "unique column names for samples and labels",
                        column_names)

        self._data, self._labels = data, labels

        super().__init__(**kwargs)
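The cross-array column-name check above can be reproduced with plain NumPy. A minimal standalone sketch (no smlb imports; the arrays and field names are made up for illustration):
```
import numpy as np

# hypothetical structured data and labels that share the column name 'C'
data = np.array([('a', 1), ('b', 2)], dtype=[('C', 'U1'), ('D', int)])
labels = np.array([(0.1,), (0.2,)], dtype=[('C', float)])

# NumPy enforces unique field names within each array, but not across both
column_names = data.dtype.names + labels.dtype.names
if len(column_names) != len(np.unique(column_names)):
    print("duplicate column names across data and labels:", column_names)
```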
Example #2
    def __init__(
        self,
        target=None,
        configuration: Optional[PlotConfiguration] = None,
        axes_labels=(None, None, None, None),
        axes_scales=("linear", "linear"),
        **kwargs,
    ):
        """Initialize Evaluation.

        Parameters:
            target: rendering target that evaluation outcome is rendered to;
                can be a single filename, or a matplotlib Axes or (Figure, Axes) pair,
                or a sequence thereof; if a matplotlib Axes or (Figure, Axes) pair,
                evaluation will add to it; if None, a new rendering target is created
            configuration: optional plot configuration controlling rendering details
            axes_labels: labels for all axes (bottom, left, top, right), None to not label an axis;
                         for shorter tuples remaining entries are assumed None, so ('x', 'y') is valid
            axes_scales: scales ("linear" or "log") for horizontal and vertical axes

        Examples:
            __init__(axes_labels=("bottom", "left", "top"))  # right is None
            __init__(axes_scales=("log", "log"))
        """

        configuration = params.any_(
            configuration, lambda arg: params.instance(arg, PlotConfiguration), params.none
        )

        super().__init__(configuration=configuration, **kwargs)

        # Axes, (Figure, Axes), filename, None, or sequence (without None)
        target_f = lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, mpl.axes.Axes),
            lambda arg: params.tuple_(
                arg,
                lambda arg: params.instance(arg, mpl.figure.Figure),
                lambda arg: params.instance(arg, mpl.axes.Axes),
                arity=2,
            ),
            params.string,
        )
        self._target = params.any_(
            target, target_f, params.none, lambda arg: params.tuple_(arg, target_f)
        )

        self._axes_labels = params.tuple_(
            axes_labels,
            lambda arg: params.any_(arg, params.string, params.none),
            arity=4,
            default=None,
        )

        self._axes_scales = params.tuple_(
            axes_scales, lambda arg: params.enumeration(arg, {"linear", "log"}), arity=2
        )

        self._figaxis = None
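The padding behaviour documented for `axes_labels` (shorter tuples are filled up with `None` to arity 4) can be shown without smlb; `pad_labels` below is a hypothetical stand-in for `params.tuple_(..., arity=4, default=None)`:
```
def pad_labels(labels, arity=4, default=None):
    """Pad a short tuple of axis labels with a default value up to the given arity."""
    labels = tuple(labels)
    return labels + (default,) * (arity - len(labels))

print(pad_labels(("x", "y")))                 # ('x', 'y', None, None)
print(pad_labels(("bottom", "left", "top")))  # ('bottom', 'left', 'top', None)
```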
Example #3
    def _intersection(lhs: "TabularData",
                      rhs: "TabularData",
                      duplicates: bool = False) -> "TabularData":
        """Specialized intersection.

        For labeled data, labels are compared as well.

        The datasets must be compatible in the sense that both are of type
        TabularData or derived, and either labeled or unlabeled.

        Parameters:
            lhs: one of the two datasets to intersect ('left hand side')
            rhs: one of the two datasets to intersect ('right hand side')
            duplicates: if False (default), the returned data do not contain
                duplicate entries; if True, duplicates are taken into account.
                Both inputs and labels have to match for duplicates.

        Returns:
            TabularData containing only samples in both datasets, either without duplicates
            (set intersection) or taking duplicates into account (multiset intersection)

        Raises:
            NotImplementedError if the set intersection can not be computed
        """

        # parameter validation
        lhs = params.instance(lhs, TabularData)
        rhs = params.instance(rhs, TabularData)
        duplicates = params.boolean(duplicates)

        # special case: empty set
        if lhs.num_samples == 0:
            return lhs.subset()  # copy
        if rhs.num_samples == 0:
            return rhs.subset()  # copy

        if lhs.is_labeled != rhs.is_labeled:
            raise InvalidParameterError("compatible TabularData",
                                        "mismatch in labeling")

        # intersection calculation
        _lhs, _rhs = TabularData._joint_data_labels(
            lhs), TabularData._joint_data_labels(rhs)

        if _lhs.dtype != _rhs.dtype:
            raise InvalidParameterError(
                "Matching TabularData",
                f"{_lhs.dtype.descr} and {_rhs.dtype.descr}")

        if duplicates is False:
            _, indices, _ = np.intersect1d(
                _lhs, _rhs, return_indices=True)  # drops any duplicates
            indices = np.sort(indices)  # restores original order
            return lhs.subset(indices)
        else:  # duplicates = True
            raise NotImplementedError(  # todo: implement
                "specialized multiset intersection not implemented for TabularData"
            )
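The order-preserving intersection above relies only on NumPy; a standalone sketch with two structured arrays that already share a dtype, as the method requires:
```
import numpy as np

dtype = [("x", int), ("y", int)]
lhs = np.array([(3, 0), (1, 1), (2, 2), (1, 1)], dtype=dtype)  # contains a duplicate
rhs = np.array([(1, 1), (3, 0)], dtype=dtype)

# intersect1d drops duplicates and returns first-occurrence indices into lhs
_, indices, _ = np.intersect1d(lhs, rhs, return_indices=True)
indices = np.sort(indices)  # restore the original sample order of lhs
print(lhs[indices])  # [(3, 0) (1, 1)]
```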
Example #4
File: optimizer.py Project: syam-s/smlb
    def __init__(self, learner: Learner, scorer: Scorer, maximize: bool = True):
        self._learner = params.instance(learner, Learner)
        self._scorer = params.instance(scorer, Scorer)

        self._maximize = params.boolean(maximize)
        # If the goal is to maximize the score, invert the value because optimizers minimize.
        if self.maximize:
            self._direction = -1
        else:
            self._direction = 1

        self._steps = []
Example #5
    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)

        xpred = params.real_matrix(data.samples())

        if self._with_uncertainties:
            try:
                preds, stddevs = self._model.predict(xpred, return_std=True)
                return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
        else:
            try:
                preds = self._model.predict(xpred, return_std=False)
                return DeltaPredictiveDistribution(mean=preds)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
Example #6
    def __init__(self,
                 internal_hp_optimization: bool = True,
                 kernel: Optional[Kernel] = None,
                 alpha: Union[float, Sequence] = 1e-5,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 normalize_y=False,
                 random_state: int = None,
                 **kwargs):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            internal_hp_optimization: if True, hyperparameters are optimized "internally"
                by the Gaussian process, that is, scikit-learn optimizes hyperparameters
                and for smlb the learner has no hyperparameters;
                if False, hyperparameters are optimized by smlb (and scikit-learn does
                not optimize any hyperparameters)
            kernel: scikit-learn kernel; if None, a single Gaussian kernel is used as default
            alpha: regularization constant (scalar or vector); added as-is to kernel matrix diagonal.
                   Equivalent to adding a "WhiteKernel"; the default is the corresponding value from
                   scikit-learn's WhiteKernel, and different from scikit-learn's GaussianProcessRegressor.
            optimizer: hyperparameter optimization algorithm; used only if internal_hp_optimization is True
            n_restarts_optimizer: number of times optimizer is restarted; only used if internal_hp_optimization is True
            normalize_y: whether to subtract the mean of the labels
            random_state: integer seed

        See skl.gaussian_process.GaussianProcessRegressor parameters.
        """

        super().__init__(**kwargs)

        internal_hp_optimization = params.boolean(internal_hp_optimization)
        kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel),
                             params.none)
        # incomplete check for alpha as dimension becomes known only at fitting time
        alpha = params.any_(
            alpha,
            lambda arg: params.real(arg, from_=0),
            lambda arg: params.real_vector(arg, domain=[0, np.inf]),
        )
        # todo: check optimizer, requires params.union (of string and callable) and params.function
        normalize_y = params.boolean(normalize_y)
        random_state = params.integer(random_state)

        if kernel is None:
            kernel = skl.gaussian_process.kernels.RBF(
            ) + skl.gaussian_process.kernels.WhiteKernel()

        assert internal_hp_optimization is True  # external HP optimization not yet supported

        self._model = skl.gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            random_state=random_state,
        )
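For reference, the scikit-learn model this wrapper assembles can be built directly; a minimal sketch with the same defaults (RBF plus WhiteKernel, alpha=1e-5) on synthetic data:
```
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
X = rng.uniform(size=(50, 2))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=50)

model = GaussianProcessRegressor(
    kernel=RBF() + WhiteKernel(),  # default kernel used above when none is given
    alpha=1e-5,                    # small constant added to the kernel matrix diagonal
    n_restarts_optimizer=0,
    normalize_y=False,
    random_state=0,
)
model.fit(X, y)
mean, stddev = model.predict(X[:5], return_std=True)
```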
Example #7
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict;

        Returns:
            predictive normal distribution
        """

        data = params.instance(data, Data)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            return NormalPredictiveDistribution(
                mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
            )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
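The "naive" uncertainties above are simply the spread of the individual trees' predictions; a standalone scikit-learn sketch of the same computation on synthetic data:
```
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.default_rng(0)
X = rng.uniform(size=(100, 3))
y = X.sum(axis=1) + 0.05 * rng.normal(size=100)

model = ExtraTreesRegressor(n_estimators=50, random_state=0).fit(X, y)

# one row of predictions per tree in the ensemble
preds = np.asarray([tree.predict(X[:5]) for tree in model.estimators_])
mean, stddev = preds.mean(axis=0), preds.std(axis=0)  # ensemble mean and "naive" spread
```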
Example #8
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict;

        Returns:
            predictive normal distribution
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's RandomForestRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None and self._correlations is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            if self._correlations is None:
                return NormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                )
            elif self._correlations == "naive":
                if (data.num_samples > 25000) and not self._force_corr:
                    warn(
                        "Input correlations requested for >2.5E4 predictions."
                        " Corelation matrix will not be computed, because a matrix this large may"
                        " take up too much RAM. (2.5E4^2 entries * 8 byes per entry / 1E6 bytes per MB = 3200MB)."
                        " To force computation anyway, set `force_corr = True` in learner constructor.",
                        UserWarning,
                    )
                    return NormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                    )
                else:
                    # Must handle single-prediction separately, as in this case np.corrcoef
                    # will return single number rather than 1x1 array.
                    if preds.shape[1] == 1:
                        corr = np.array([[1]])
                    else:
                        corr = np.corrcoef(preds, rowvar=False)
                    return CorrelatedNormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0), corr=corr
                    )
            else:
                raise BenchmarkError(
                    "internal error, unknown parameter for correlations of RandomForestRegressionSklearn"
                )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of RandomForestRegressionSklearn"
            )
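The size guard above is a back-of-the-envelope estimate of the correlation matrix's memory footprint (one float64 per entry); the arithmetic, spelled out:
```
n = 25_000                       # prediction count at which the warning triggers
bytes_per_entry = 8              # one float64 per correlation matrix entry
matrix_bytes = n * n * bytes_per_entry
print(matrix_bytes / 1e6, "MB")  # 5000.0 MB for a 25,000 x 25,000 matrix
```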
Example #9
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        For Gaussian processes, both the noise-free predictive (posterior)
        distribution as well as the noise estimate are normally distributed.
        The predictive distribution with noise is the sum of the former two.

        The $\alpha$ training noise specified at initialization time is not
        added at prediction time, and thus not part of the noise model.
        The current implementation considers contributions from any
        WhiteKernel or other kernel that has a hyperparameter 'noise_level'.

        Limitations:
            It is a currently accepted shortcoming that WhiteKernels that are
            not 'first-level' sum members might yield wrong noise models.
            Examples:
            WhiteKernel(...) + other kernels will work
            kernel(...) * WhiteKernel(...) will not work as intended

            Training data noise $\alpha$ is not added

        Parameters:
            data: finite indexed data to predict;

        Returns:
            predictive normal distribution with the following decomposition:
                predicted: sum of model and noise distribution
                noise_part: normal distribution for estimated noise
                signal_part: normal distribution for estimated model contribution;
                             the Gaussian process' "predictive variance";
                             depends only on distance from the training data
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())
        n = data.num_samples

        # predict
        preds, stddevs = self._model.predict(xpred, return_std=True)

        # noise
        # the noise is the sum of the noise_level values of all WhiteKernels, where noise_level is a variance (not a standard deviation)
        # this assumes that the noise levels are independent
        noise = tuple(v for k, v in self._model.kernel_.get_params().items()
                      if k.endswith("noise_level"))
        noise = np.ones(shape=n) * np.sum(noise)
        noise_part = NormalPredictiveDistribution(mean=np.zeros(shape=n),
                                                  stddev=np.sqrt(noise))

        return NormalPredictiveDistribution(
            mean=preds,
            stddev=np.sqrt(np.square(stddevs) + noise),
            noise_part=noise_part,
            signal_part=NormalPredictiveDistribution(mean=preds,
                                                     stddev=stddevs),
        )
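How the noise_level terms are collected can be seen on a small fitted model; a sketch assuming the default RBF + WhiteKernel kernel from the initializer shown earlier:
```
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
X = rng.uniform(size=(50, 1))
y = np.sin(3 * X[:, 0]) + 0.1 * rng.normal(size=50)

model = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), alpha=1e-5).fit(X, y)

# fitted kernel hyperparameters are exposed as a flat dict, e.g. {"k2__noise_level": ...}
noise = tuple(
    v for k, v in model.kernel_.get_params().items() if k.endswith("noise_level")
)
print(noise)  # variances (not standard deviations) of the estimated noise terms
```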
Example #10
File: optimizer.py Project: syam-s/smlb
 def __init__(
     self,
     input_: TabularData,
     output: PredictiveDistribution,
     scores: Sequence[float],
     **kwargs
 ):
     super().__init__(**kwargs)
     self._input: TabularData = params.instance(input_, TabularData)
     self._output: PredictiveDistribution = params.instance(output, PredictiveDistribution)
     # total number of function evaluations during this step
     self._num_evaluations: int = params.integer(self._input.num_samples, from_=1)
     self._scores: Sequence[float] = params.any_(
         scores,
         lambda arg: params.sequence(arg, length=1, type_=float),
         lambda arg: params.sequence(arg, length=self._num_evaluations, type_=float),
     )
Example #11
File: data.py Project: syam-s/smlb
def complement(lhs: "Data", rhs: "Data", duplicates: bool = False) -> "Data":
    """(Multi)set complement of two datasets.

    This complement method does not retain duplicates by default.
    For multiset behaviour, specify 'duplicates=True'.

    Parameters:
        lhs: set A in A - B ('left hand side')
        rhs: set B in A - B ('right hand side')
        duplicates: if False (default), the returned data do not contain
            duplicate entries; if True, duplicates are taken into account.
            Both inputs and labels have to match for duplicates.

    Returns:
        Data containing all samples in lhs but not in rhs, either without duplicates
        (set complement) or taking duplicates into account (multiset complement)
    """

    # parameter validation
    lhs = params.instance(lhs, Data)
    rhs = params.instance(rhs, Data)

    # special case: empty set
    if lhs.num_samples == 0:
        return lhs.subset()
    if rhs.num_samples == 0:
        return lhs.subset()

    # try specialized implementations
    exception = None
    try:
        if hasattr(lhs.__class__, "_complement"):
            return lhs.__class__._complement(lhs, rhs, duplicates)
    except (NotImplementedError, InvalidParameterError) as e:
        exception = e

    try:
        if hasattr(rhs.__class__, "_complement"):
            return rhs.__class__._complement(lhs, rhs, duplicates)
    except (NotImplementedError, InvalidParameterError) as e:
        exception = e

    # no specialized method found or succeeded
    raise NotImplementedError(
        "generalized (multi)set complement not implemented") from exception
Example #12
File: noise.py Project: syam-s/smlb
    def apply(self, data: Data) -> Data:
        """Transforms data.

        Parameters:
            data: labeled data to transform

        Returns:
            transformed data

        Raises:
            InvalidParameterError if Data is not labeled
        """

        data = params.instance(data, Data)
        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")

        # patch the labels() method of the data object (not class)
        # there is no need to store the old labels function as it is a class member, not an object member

        for name in ("_orig_labels", "labels", "_noise"):
            # fail if the attribute already exists on the object; a collision could otherwise be avoided by choosing a random name
            if name in data.__dict__:
                raise BenchmarkError(
                    f"internal error: data object already has {name} method")

        # create a copy of the dataset
        data = copy.deepcopy(data)

        # rename labels to _labels for data only
        setattr(data, "_orig_labels", getattr(data, "labels"))

        # store noise model
        setattr(data, "_noise", self._noise)

        # add wrapper as new labels() method

        def labels(self, indices=None):
            """Query labels of a sequence of samples.

            This wrapper adds noise.

            Parameters:
                indices: a sequence of sample 'indices'.
                         See 'samples()' for details.

            Returns:
                a sequence of labels
            """

            labels = self._orig_labels(indices)
            return labels + self._noise.noise(labels.shape)

        setattr(data, "labels", labels.__get__(data))

        return data
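The per-object patch at the end relies on `function.__get__(obj)` producing a method bound to that single instance; a minimal standalone sketch of the same trick:
```
class Dataset:
    def labels(self):
        return [1.0, 2.0, 3.0]

d = Dataset()
setattr(d, "_orig_labels", d.labels)  # keep a handle on the original bound method

def labels(self):
    """Wrapper that perturbs the original labels (stand-in for adding noise)."""
    return [v + 0.5 for v in self._orig_labels()]

setattr(d, "labels", labels.__get__(d))  # bind the wrapper to this one object only

print(d.labels())          # [1.5, 2.5, 3.5] -- patched instance
print(Dataset().labels())  # [1.0, 2.0, 3.0] -- other instances are unaffected
```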
Example #13
 def __init__(
     self,
     data: VectorSpaceData,
     model: Learner,
     scorer: Scorer,
     optimizers: Sequence[Optimizer],
     evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
     num_trials: int = 1,
     training_data: Optional[Data] = None,
 ):
     self._data = params.instance(data, VectorSpaceData)
     self._scorer = params.instance(scorer, Scorer)
     self._model = params.instance(model, Learner)
     self._optimizers = params.sequence(optimizers, type_=Optimizer)
     self._evaluations = params.tuple_(
         evaluations, lambda arg: params.instance(arg, Evaluation)
     )
     self._num_trials = params.integer(num_trials, from_=1)
     self._training_data = params.optional_(
         training_data, lambda arg: params.instance(arg, Data)
     )
Example #14
    def __init__(
        self,
        data: Data,
        training: Sequence[Sampler],
        validation: Sampler,
        learners: Sequence[SupervisedLearner],
        features: DataValuedTransformation = IdentityFeatures(),
        metric: ScalarEvaluationMetric = RootMeanSquaredError(),
        evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
        progressf: Optional[Callable[[int, int], None]] = None,
    ):
        """Initialize workflow.

        Parameters:
            data: labeled data
            training: sequence of Samplers, one for each training set size
            validation: Sampler for validation set
            learners: sequence of supervised regression algorithms
            features: any data-valued transformation
            metric: evaluation metric to use; root mean squared error by default
            evaluations: one or more evaluations; default is a learning curve plot
            progressf: callable with two parameters, done iterations and total number of iterations
        """

        self._data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True)
        if not self._data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        self._training = params.sequence(training, type_=Sampler)
        self._validation = params.instance(validation, Sampler)
        self._learners = params.sequence(learners, type_=SupervisedLearner)
        self._features = params.instance(features, Features)
        self._metric = params.instance(metric, ScalarEvaluationMetric)
        self._evaluations = params.tuple_(
            evaluations, lambda arg: params.instance(arg, Evaluation)
        )
        self._progressf = params.optional_(
            progressf, lambda arg: params.callable(arg, num_pos_or_kw=2)
        )
        if self._progressf is None:
            self._progressf = lambda *args: None
Example #15
File: noise.py Project: syam-s/smlb
    def __init__(self, noise: Noise, **kwargs):
        """Initialize state.

        Parameters:
            noise: noise model

        Returns:
            dataset with noisy labels
        """

        super().__init__(**kwargs)

        self._noise = params.instance(noise, Noise)
Example #16
File: evaluations.py Project: syam-s/smlb
    def __init__(self, configuration: Optional[EvaluationConfiguration] = None, **kwargs):
        """Initialize Evaluation.

        Parameters:
            configuration: optional configuration object controlling rendering details
        """

        super().__init__(**kwargs)

        self._configuration = params.any_(
            configuration, lambda arg: params.instance(arg, EvaluationConfiguration), params.none
        )
        if self._configuration is None:
            self._configuration = self._default_configuration()

        self._auxiliary = dict()  # internal handle on optional auxiliary outcome data
Example #17
    def __init__(self, noise_part=None, signal_part=None, **kwargs):
        """Initialize decompositions.

        Parameters:
            noise_part: estimated noise distribution; the aleatoric component
            signal_part: estimated signal distribution; the epistemic component
        """

        super().__init__(**kwargs)

        optional = lambda arg: params.any_(
            arg, lambda x: params.instance(x, PredictiveDistribution), params.none
        )
        self._noise_part = optional(noise_part)
        self._signal_part = optional(signal_part)

Example #18
    def fit(self, data: Data) -> Learner:
        """Fits the model to training data.

        Parameters:
            data: labeled training data

        Returns:
            self (allows chaining)

        Raises:
            InvalidParameterError if data is not labeled
        """

        data = params.instance(data, Data)
        if not data.is_labeled:
            raise InvalidParameterError("Labeled data", "unlabeled data")

        return self
Example #19
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict;

        Returns:
            predictive normal distribution
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            # todo: there is a discrepancy between the ensemble mean and predictions
            #       until this has been resolved, naive uncertainties are not supported
            #       when fixing this, update parameter validation and unit tests
            raise NotImplementedError
        #     # #trees x #samples matrix of predictions of ensemble's trees
        #     staged_preds = np.asfarray(tuple(self._model.staged_predict(xpred)))

        #     # this does NOT yield the same predictions as self._model.predict(xpred)
        #     mean, stddev = (
        #         np.mean(staged_preds, axis=0),
        #         np.std(staged_preds, axis=0),
        #     )
        #     return NormalPredictiveDistribution(mean=mean, stddev=stddev)
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
Example #20
File: sampling.py Project: syam-s/smlb
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random subset of data.

        Parameters:
            data: dataset to sample from

        Returns:
            random subset of data
        """

        data = params.instance(data, Data)
        if not data.is_finite:
            raise InvalidParameterError("finite Data", type(data).__name__)
        size = params.integer(
            self._size, from_=0, to=data.num_samples
        )  # validate upper bound (see __init__)

        ind = self.random.choice(data.num_samples, size=size, replace=False)

        return data.subset(ind)
Example #21
File: sampling.py Project: syam-s/smlb
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random vectors.

        Parameters:
            data: Data to draw from

        Returns:
            TabularData of vectors
        """

        data = params.instance(data, Data)
        if self._domain is None:
            if data.domain is None:
                domain = np.asarray([[0, 1]] * data.dimensions)
            else:
                domain = data.domain
        else:
            domain = params.hypercube_domain(
                self._domain, dimensions=data.dimensions
            )  # checks dimensionality (see __init__)

        for low, high in domain:
            if low == -np.inf or high == np.inf:
                raise BenchmarkError("can not sample from infinite domain")

        # vectors = np.transpose(
        #     np.asfarray(
        #         [
        #             self.random.uniform(low=low, high=high, size=self._size)
        #             for (low, high) in self._domain
        #         ]
        #     )
        # )

        # this version avoids the python loop for efficiency in high dimensions
        vectors = (
            self.random.uniform(size=(self._size, data.dimensions)) * (domain[:, 1] - domain[:, 0])
            + domain[:, 0]  # noqa W503
        )

        return data.subset(vectors)
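The vectorized expression at the end maps unit-cube samples onto an arbitrary box with a single broadcast; a plain NumPy sketch:
```
import numpy as np

rng = np.random.default_rng(0)
domain = np.asarray([[0.0, 1.0], [-5.0, 5.0], [10.0, 20.0]])  # one [low, high] row per dimension
size = 4

# draw in the unit cube, then stretch and shift each column into its [low, high] interval
vectors = rng.uniform(size=(size, len(domain))) * (domain[:, 1] - domain[:, 0]) + domain[:, 0]
print(vectors.shape)  # (4, 3); each column stays within its own bounds
```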
Example #22
    def fit(self, data: Data) -> "RandomForestRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on
        
        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #23
    def apply(self, data: Data) -> TabularData:
        """Compute matminer composition-based materials features.

        Parameters:
            data: material compositions, given as sum formula strings
                  Can be labeled, and labels will be retained

        Returns:
            TabularData or TabularLabeledData with matminer composition-based
            materials features as samples
        """

        data = params.instance(data, Data)

        inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
        features = self._mmfeatures.featurize_many(inputs_, pbar=False)
        features = np.asfarray(features)

        result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

        return result
Example #24
    def _joint_data_labels(ds):
        """Single structured array for data and labels for comparison.

        Structured arrays can be used to run NumPy set methods
        on arrays with more than one dimension.
        """

        ds = params.instance(ds, TabularData)

        if is_sequence(ds._data.dtype.names):  # structured array
            lhs = ds._data
        else:  # homogeneous array, possibly many dimensions
            lhs = np.reshape(ds._data, (ds.num_samples, -1))
            lhs = lhs.view([("", ds._data.dtype)] * np.prod(lhs.shape[1:]))
            lhs = np.reshape(lhs, ds.num_samples)

        if not ds.is_labeled:
            result = lhs
        else:  # is_labeled
            # alternatives for hstack() that did not work included
            # numpy.lib.recfunctions.merge_arrays.

            if is_sequence(ds._labels.dtype.names):  # structured array
                rhs = ds._labels
            else:  # homogeneous array, possibly high-dimensional
                rhs = np.reshape(ds._labels, (ds.num_samples, -1))
                rhs = rhs.view([(str(i), rhs.dtype)
                                for i in range(np.prod(rhs.shape[1:]))])
                rhs = np.reshape(rhs, ds.num_samples)

            # lhs and rhs are structured array (views) now
            # unfortunately, np.hstack fails for these
            dtypes = lhs.dtype.descr + rhs.dtype.descr
            result = np.empty(ds.num_samples, dtype=dtypes)
            for name in lhs.dtype.names:
                result[name] = lhs[name]
            for name in rhs.dtype.names:
                result[name] = rhs[name]

        return result
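The reshape-and-view step above turns an n x k homogeneous array into a length-n structured array so that one-dimensional set routines operate row-wise; a standalone sketch (NumPy auto-names empty field names f0, f1, ...):
```
import numpy as np

data = np.array([[1, 2], [3, 4], [1, 2]])
n = data.shape[0]

rows = np.reshape(data, (n, -1))
rows = rows.view([("", data.dtype)] * rows.shape[1])  # fields become ('f0', 'f1')
rows = np.reshape(rows, n)

# each element is now one compound value, so 1-D set operations compare whole rows
print(np.unique(rows))  # [(1, 2) (3, 4)]
```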
Example #25
File: sampling.py Project: syam-s/smlb
    def apply(self, data: VectorSpaceData):
        """Sample set from evenly-spaced grid in a vector space.

        A specified number of samples are drawn from the smallest
        evenly-spaced grid of sufficient size.

        Returns:
            sampled set

        If size does not correspond exactly to a k x ... x k grid, that
        is, if size is not a power of k, the next-largest grid of size
        k+1 x ... x k+1 is created and some of its samples are removed.
        Here, k denotes the number of evenly-spaced samples per dimension.
        """

        data = params.instance(data, VectorSpaceData)

        k = self.next_grid_size(data, self._size)
        population = self.full_grid(data, samples_per_dim=k, domain=self._domain)
        ind = self.random.choice(len(population), size=self._size, replace=False)

        return data.subset(population[ind])
Example #26
    def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: labeled data to train on;
                  must derive from IndexedData and LabeledData

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
    def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(data, Data)

        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #28
File: sampling.py Project: syam-s/smlb
    def full_grid(self, data: VectorSpaceData, samples_per_dim: int, domain=None):
        """Full multi-dimensional evenly-spaced grid.

        For one sample per dimension, the result is a single vector, the mean of the domain.

        Parameters:
            data: sampled dataset
            samples_per_dim: number of evenly-spaced samples to take in each dimension
            domain: (sub)domain to sample from; by default, data's domain is used

        Returns:
            two-dimensional NumPy array where samples are rows
        """

        data = params.instance(data, VectorSpaceData)
        k = params.integer(samples_per_dim, above=0)  # positive integer
        domain = data.domain if domain is None else domain
        domain = params.hypercube_domain(domain, data.dimensions)

        if k == 1:
            return np.mean(domain, axis=1).reshape((1, -1))
        locs = (np.linspace(xfrom, xto, k) for xfrom, xto in domain)
        return np.asfarray(list(itertools.product(*locs)))
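The grid construction at the end combines per-dimension `linspace` points with `itertools.product`; a standalone sketch for a two-dimensional domain with three samples per dimension:
```
import itertools
import numpy as np

domain = np.asarray([[0.0, 1.0], [10.0, 20.0]])  # one [from, to] row per dimension
k = 3  # samples per dimension

locs = (np.linspace(xfrom, xto, k) for xfrom, xto in domain)
grid = np.asarray(list(itertools.product(*locs)))
print(grid.shape)  # (9, 2): the full 3 x 3 grid, one sample per row
```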
Example #29
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self
Example #30
    def apply(self, data: Data) -> TabularData:
        """Compute selected molecular features.

        Parameters:
            data: molecular structures given as SMILES strings.
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with CDK molecular features as samples
        """

        data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

        failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

        # set up molecule SMILES
        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

        def parse_smiles(s: str, i: int):
            """Return parsed SMILES string or None on failure."""
            try:
                return parser.parseSmiles(self._samplef(s))
            except py4j.protocol.Py4JJavaError:
                # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
                failmode.handle_failure(i)
                return None  # internal sentinel value

        smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

        # compute descriptors
        # todo: the dtype of the columns could be set in advance by querying the descriptors
        #       currently, all values are stored as floating point numbers
        features = np.empty((data.num_samples, np.sum(self._arities)))
        index = 0

        def java_is_instance_of(object_, class_):
            return py4j.java_gateway.is_instance_of(
                self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
            )

        def check_arity(expected, actual):
            if expected != actual:
                raise BenchmarkError(
                    f"Invalid descriptor result arity (expected {expected}, was {actual})"
                )

        for descriptor, arity in zip(self._descriptors, self._arities):
            for i, smile in enumerate(smiles):
                if smile is None:
                    features[i, index : index + arity] = float("nan")
                    continue

                try:
                    value = descriptor.calculate(smile).getValue()
                except py4j.protocol.Py4JJavaError:
                    failmode.handle_failure(i)
                    features[i, index : index + arity] = float("nan")
                    continue

                if java_is_instance_of(value, "IntegerResult"):
                    check_arity(arity, 1)
                    features[i, index] = int(value.intValue())
                elif java_is_instance_of(value, "DoubleResult"):
                    check_arity(arity, 1)
                    features[i, index] = float(value.doubleValue())
                elif java_is_instance_of(value, "BooleanResult"):
                    check_arity(arity, 1)
                    features[i, index] = bool(value.booleanValue())
                elif java_is_instance_of(value, "IntegerArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        int(value.get(j)) for j in range(value.length())
                    )
                elif java_is_instance_of(value, "DoubleArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        float(value.get(j)) for j in range(value.length())
                    )
                # there seems to be no BooleanArrayResult in CDK
                else:
                    name = value.getClass().getSimpleName()
                    raise BenchmarkError(f"Unsupported CDK result type '{name}'")
            index += arity

        result = (
            TabularData(data=features, labels=data.labels())
            if data.is_labeled
            else TabularData(data=features)
        )

        result = failmode.finalize(result)

        return result