Example #1
    def finalize(self, data: Data) -> Data:
        """Change dataset according to registered failures and failure mode.

        Parameters:
            data: transformed Data

        Returns:
            Transformed Data after handling failures.
        """

        self.failures = sorted(set(self.failures))  # remove duplicate indices

        if self.failmode == "raise":
            if len(self.failures) > 0:
                raise BenchmarkError(
                    "DataTransformation failed for some samples")
            return data
        elif self.failmode == "drop":
            return complement(data,
                              data.subset(self.failures))  # todo: duplicates?
        elif self.failmode == "mask":
            self.mask[self.failures] = True
            return data
        elif self.failmode == "index":
            self.index.extend(self.failures)
            return data

        raise BenchmarkError(
            f"Internal error, unrecognized failure mode '{self.failmode}'")
Example #2
    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)

        xpred = params.real_matrix(data.samples())

        try:
            if self._with_uncertainties:
                preds, stddevs = self._model.predict(xpred, return_std=True)
                return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
            else:
                preds = self._model.predict(xpred, return_std=False)
                return DeltaPredictiveDistribution(mean=preds)
        except Py4JJavaError as e:
            raise BenchmarkError("applying lolo model failed") from e
Example #3
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution if uncertainties were requested,
            otherwise a delta distribution
        """

        data = params.instance(data, Data)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            return NormalPredictiveDistribution(
                mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
            )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
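The "naive" uncertainty above is simply the spread of the individual trees' predictions. A self-contained sketch of the same technique with plain scikit-learn and synthetic data:

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.default_rng(42)
X = rng.uniform(size=(100, 2))
y = X[:, 0] + np.sin(X[:, 1])

model = ExtraTreesRegressor(n_estimators=50).fit(X, y)

# trees x samples matrix of per-tree predictions
preds = np.asarray([tree.predict(X) for tree in model.estimators_])

mean = np.mean(preds, axis=0)   # matches model.predict(X) up to rounding
stddev = np.std(preds, axis=0)  # "naive" per-sample uncertainty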
Example #4
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution (with correlations if requested)
            if uncertainties were requested, otherwise a delta distribution
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's RandomForestRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None and self._correlations is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            if self._correlations is None:
                return NormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                )
            elif self._correlations == "naive":
                if (data.num_samples > 25000) and not self._force_corr:
                    warn(
                        "Input correlations requested for >2.5E4 predictions."
                        " Correlation matrix will not be computed, because a matrix this large may"
                        " take up too much RAM. (2.5E4^2 entries * 8 bytes per entry / 1E6 bytes per MB = 5000 MB.)"
                        " To force computation anyway, set `force_corr = True` in learner constructor.",
                        UserWarning,
                    )
                    return NormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                    )
                else:
                    # Must handle single-prediction separately, as in this case np.corrcoef
                    # will return single number rather than 1x1 array.
                    if preds.shape[1] == 1:
                        corr = np.array([[1]])
                    else:
                        corr = np.corrcoef(preds, rowvar=False)
                    return CorrelatedNormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0), corr=corr
                    )
            else:
                raise BenchmarkError(
                    "internal error, unknown parameter for correlations of RandomForestRegressionSklearn"
                )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of RandomForestRegressionSklearn"
            )
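The correlation branch above reduces to np.corrcoef over the trees x samples prediction matrix, with a special case for a single sample. A minimal sketch with a synthetic prediction matrix:

import numpy as np

rng = np.random.default_rng(0)
preds = rng.normal(size=(50, 4))  # stand-in: 50 trees x 4 samples

# columns are samples, so rowvar=False correlates samples across trees
corr = np.corrcoef(preds, rowvar=False)  # 4 x 4 correlation matrix

# edge case: for a single sample np.corrcoef returns a scalar,
# so the 1x1 matrix must be constructed by hand (as in the code above)
single = preds[:, :1]
corr_single = np.array([[1.0]]) if single.shape[1] == 1 else np.corrcoef(single, rowvar=False)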
Example #5
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        For Gaussian processes, both the noise-free predictive (posterior)
        distribution as well as the noise estimate are normally distributed.
        The predictive distribution with noise is the sum of the former two.

        The $\alpha$ training noise specified at initialization time is not
        added at prediction time, and thus not part of the noise model.
        The current implementation considers contributions from any
        WhiteKernel or other kernel that has a hyperparameter 'noise_level'.

        Limitations:
            It is a currently accepted shortcoming that WhiteKernels that are
            not 'first-level' sum members might yield wrong noise models.
            Examples:
            WhiteKernel(...) + other kernels will work
            kernel(...) * WhiteKernel(...) will not work as intended

            Training data noise $\alpha$ is not added

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution with the following decomposition:
                predicted: sum of model and noise distribution
                noise_part: normal distribution for estimated noise
                signal_part: normal distribution for estimated model contribution;
                             the Gaussian process' "predictive variance";
                             depends only on distance from the training data
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())
        n = data.num_samples

        # predict
        preds, stddevs = self._model.predict(xpred, return_std=True)

        # noise
        # noise is the sum of all WhiteKernel noise_level values, where noise_level is a variance (not a standard deviation)
        # this assumes that the noise levels are independent
        noise = tuple(v for k, v in self._model.kernel_.get_params().items()
                      if k.endswith("noise_level"))
        noise = np.ones(shape=n) * np.sum(noise)
        noise_part = NormalPredictiveDistribution(mean=np.zeros(shape=n),
                                                  stddev=np.sqrt(noise))

        return NormalPredictiveDistribution(
            mean=preds,
            stddev=np.sqrt(np.square(stddevs) + noise),
            noise_part=noise_part,
            signal_part=NormalPredictiveDistribution(mean=preds,
                                                     stddev=stddevs),
        )
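The noise extraction above relies on scikit-learn exposing nested kernel hyperparameters through get_params(), with keys ending in 'noise_level' for WhiteKernels. A minimal self-contained sketch of that mechanism:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(1)
X = rng.uniform(size=(30, 1))
y = np.sin(3 * X[:, 0]) + rng.normal(scale=0.1, size=30)

gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel()).fit(X, y)

# fitted-kernel parameters; the WhiteKernel's key looks like "k2__noise_level"
# (the "..._bounds" keys do not match endswith("noise_level"))
noise = tuple(v for k, v in gp.kernel_.get_params().items() if k.endswith("noise_level"))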
Example #6
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        """Evaluate the underlying function; predictions are exact (zero stddev)."""

        if not data.is_finite:
            raise InvalidParameterError(
                "a finite dataset",
                f"an infinite dataset of type {data.__class__}")
        means = self._function.labels(data.samples())
        stddevs = np.zeros_like(means)
        return NormalPredictiveDistribution(means, stddevs)
Example #7
def validate_data_interface(ds: smlb.Data) -> bool:
    """Tests for compliance with Data interface.

    Runs tests that every Data-compliant class should satisfy.

    Returns:
        True

    Raises:
        AssertionError for failed tests
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # infinite data tests
        pass

    else:
        # finite data tests

        # integer-representable non-negative size
        assert int(ds.num_samples) == ds.num_samples
        assert ds.num_samples >= 0

        # all samples are returned
        assert len(ds.samples()) == ds.num_samples

        # subsets
        assert ds.subset([]).num_samples == 0
        assert ds.subset().num_samples <= ds.num_samples
        assert ds.subset(duplicates=True).num_samples == ds.num_samples

        # intersection with self
        assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
        # assert smlb.intersection(ds, ds, duplicates=True).num_samples == ds.num_samples  # todo: support this as well

        # complement with self
        assert smlb.complement(ds, ds).num_samples == 0
        # assert smlb.complement(ds, ds, duplicates=True).num_samples == 0  # todo: support this as well

        if ds.is_labeled:
            # all labels are returned
            assert len(ds.labels()) == ds.num_samples

            # subsets
            assert ds.subset([]).is_labeled
            assert ds.subset().is_labeled

            # intersection
            assert smlb.intersection(ds, ds).is_labeled
            # assert smlb.intersection(ds, ds, duplicates=True).is_labeled  # todo: support this as well

            # complement
            assert smlb.complement(ds, ds).is_labeled
            # assert smlb.complement(ds, ds, duplicates=True).is_labeled  # todo: support this as well

    return True
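A usage sketch, assuming TabularData is exposed by the smlb package with the constructor used in the other examples (a samples matrix and optional labels):

import numpy as np
import smlb

ds = smlb.TabularData(
    data=np.asarray([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]),
    labels=np.asarray([1.0, 2.0, 3.0]),
)

assert validate_data_interface(ds)  # raises AssertionError on any violation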
Example #8
    def apply(self, data: Data) -> TabularData:
        """Compute matminer composition-based materials features.

        Parameters:
            data: material compositions, given as sum formula strings
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with matminer composition-based materials features
            as samples; labels are retained if the input data was labeled
        """

        data = params.instance(data, Data)

        inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
        features = self._mmfeatures.featurize_many(inputs_, pbar=False)
        features = np.asfarray(features)

        result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

        return result
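A minimal sketch of the underlying matminer call, assuming matminer and pymatgen are installed; ElementProperty's "magpie" preset is one common composition featurizer:

import numpy as np
from pymatgen.core import Composition
from matminer.featurizers.composition import ElementProperty

featurizer = ElementProperty.from_preset("magpie")

compositions = [Composition("Fe2O3"), Composition("NaCl")]
features = np.asarray(featurizer.featurize_many(compositions, pbar=False))
print(features.shape)  # (2, number of magpie features)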
Example #9
    def fit(self, data: Data) -> "RandomForestRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on
        
        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
    def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(data, Data)

        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #11
    def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: labeled data to train on;
                  must derive from IndexedData and LabeledData

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #12
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self
Example #13
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            delta distribution of the predictions; "naive" uncertainties
            are currently not supported (see comment below)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            # todo: there is a discrepancy between the ensemble mean and predictions
            #       until this has been resolved, naive uncertainties are not supported
            #       when fixing this, update parameter validation and unit tests
            raise NotImplementedError
        #     # #trees x #samples matrix of predictions of ensemble's trees
        #     staged_preds = np.asfarray(tuple(self._model.staged_predict(xpred)))

        #     # this does NOT yield the same predictions as self._model.predict(xpred)
        #     mean, stddev = (
        #         np.mean(staged_preds, axis=0),
        #         np.std(staged_preds, axis=0),
        #     )
        #     return NormalPredictiveDistribution(mean=mean, stddev=stddev)
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
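The discrepancy noted in the commented-out code has a concrete cause: scikit-learn's staged_predict() yields the cumulative prediction after each boosting stage, so the mean over stages is not the converged ensemble prediction. A minimal sketch using GradientBoostingRegressor directly (staged_predict is a gradient-boosting method; the class names above are kept as in the source):

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(2)
X = rng.uniform(size=(100, 2))
y = X[:, 0] - X[:, 1]

model = GradientBoostingRegressor(n_estimators=20).fit(X, y)

# stages x samples matrix; row k is the prediction after k+1 stages
staged = np.asarray(list(model.staged_predict(X)))

assert np.allclose(staged[-1], model.predict(X))   # only the last stage matches
print(np.abs(staged.mean(axis=0) - model.predict(X)).max())  # nonzero gap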
Example #14
File: sampling.py  Project: syam-s/smlb
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random subset of data.

        Parameters:
            data: dataset to sample from

        Returns:
            random subset of data
        """

        data = params.instance(data, Data)
        if not data.is_finite:
            raise InvalidParameterError("finite Data", type(data).__name__)
        size = params.integer(
            self._size, from_=0, to=data.num_samples
        )  # validate upper bound (see __init__)

        ind = self.random.choice(data.num_samples, size=size, replace=False)

        return data.subset(ind)
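The draw itself is NumPy sampling without replacement over row indices. A minimal self-contained sketch:

import numpy as np

rng = np.random.default_rng(4)
data = np.arange(20.0).reshape(10, 2)  # stand-in finite dataset of 10 samples

ind = rng.choice(len(data), size=4, replace=False)  # distinct row indices
subset = data[ind]                                  # random subset, no duplicates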
Example #15
File: sampling.py  Project: syam-s/smlb
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random vectors.

        Parameters:
            data: Data to draw from

        Returns:
            TabularData of vectors
        """

        data = params.instance(data, Data)
        if self._domain is None:
            if data.domain is None:
                domain = np.asarray([[0, 1]] * data.dimensions)
            else:
                domain = data.domain
        else:
            domain = params.hypercube_domain(
                self._domain, dimensions=data.dimensions
            )  # checks dimensionality (see __init__)

        for low, high in domain:
            if low == -np.inf or high == np.inf:
                raise BenchmarkError("can not sample from infinite domain")

        # vectors = np.transpose(
        #     np.asfarray(
        #         [
        #             self.random.uniform(low=low, high=high, size=self._size)
        #             for (low, high) in self._domain
        #         ]
        #     )
        # )

        # this version avoids the python loop for efficiency in high dimensions
        vectors = (
            self.random.uniform(size=(self._size, data.dimensions)) * (domain[:, 1] - domain[:, 0])
            + domain[:, 0]  # noqa W503
        )

        return data.subset(vectors)
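The vectorized draw above is an affine rescaling of unit-cube samples, avoiding a per-dimension Python loop. A minimal self-contained sketch:

import numpy as np

rng = np.random.default_rng(3)
domain = np.asarray([[0.0, 1.0], [-2.0, 2.0], [10.0, 20.0]])  # per-dimension [low, high]

size = 1000
vectors = rng.uniform(size=(size, len(domain))) * (domain[:, 1] - domain[:, 0]) + domain[:, 0]

# every coordinate falls inside its dimension's interval
assert np.all(vectors >= domain[:, 0]) and np.all(vectors < domain[:, 1])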
Example #16
    def apply(self, data: Data) -> TabularData:
        """Compute selected molecular features.

        Parameters:
            data: molecular structures given as SMILES strings.
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with CDK molecular features as samples
        """

        data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

        failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

        # set up molecule SMILES
        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

        def parse_smiles(s: str, i: int):
            """Return parsed SMILES string or None on failure."""
            try:
                return parser.parseSmiles(self._samplef(s))
            except py4j.protocol.Py4JJavaError:
                # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
                failmode.handle_failure(i)
                return None  # internal sentinel value

        smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

        # compute descriptors
        # todo: the dtype of the columns could be set in advance by querying the descriptors
        #       currently, all values are stored as floating point numbers
        features = np.empty((data.num_samples, np.sum(self._arities)))
        index = 0

        def java_is_instance_of(object_, class_):
            return py4j.java_gateway.is_instance_of(
                self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
            )

        def check_arity(expected, actual):
            if expected != actual:
                raise BenchmarkError(
                    f"Invalid descriptor result arity (expected {expected}, was {actual})"
                )

        for descriptor, arity in zip(self._descriptors, self._arities):
            for i, smile in enumerate(smiles):
                if smile is None:
                    features[i, index : index + arity] = float("nan")
                    continue

                try:
                    value = descriptor.calculate(smile).getValue()
                except py4j.protocol.Py4JJavaError:
                    failmode.handle_failure(i)
                    features[i, index : index + arity] = float("nan")
                    continue

                if java_is_instance_of(value, "IntegerResult"):
                    check_arity(arity, 1)
                    features[i, index] = int(value.intValue())
                elif java_is_instance_of(value, "DoubleResult"):
                    check_arity(arity, 1)
                    features[i, index] = float(value.doubleValue())
                elif java_is_instance_of(value, "BooleanResult"):
                    check_arity(arity, 1)
                    features[i, index] = bool(value.booleanValue())
                elif java_is_instance_of(value, "IntegerArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        int(value.get(j)) for j in range(value.length())
                    )
                elif java_is_instance_of(value, "DoubleArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        float(value.get(j)) for j in range(value.length())
                    )
                # there seems to be no BooleanArrayResult in CDK
                else:
                    name = value.getClass().getSimpleName()
                    raise BenchmarkError(f"Unsupported CDK result type '{name}'")
            index += arity

        result = (
            TabularData(data=features, labels=data.labels())
            if data.is_labeled
            else TabularData(data=features)
        )

        result = failmode.finalize(result)

        return result