Exemplo n.º 1
0
def validate_data_interface(ds: smlb.Data) -> bool:
    """Check a dataset for compliance with the Data interface.

    Runs the generic checks that every Data-compliant class should pass.
    For data reporting an infinite number of samples, only the
    inheritance check is performed.

    Parameters:
        ds: dataset to check

    Returns:
        True if no check failed

    Raises:
        AssertionError for failed checks
    """

    # inheritance from the abstract base class, actual or "virtual"
    assert isinstance(ds, smlb.Data)

    # infinite data: no further checks are implemented
    if ds.num_samples == float("inf"):
        return True

    # finite data from here on

    # size is a non-negative whole number
    assert int(ds.num_samples) == ds.num_samples
    assert ds.num_samples >= 0

    # samples() returns one entry per sample
    assert len(ds.samples()) == ds.num_samples

    # subsets: empty selection, default call, and duplicates retained
    assert ds.subset([]).num_samples == 0
    assert ds.subset().num_samples <= ds.num_samples
    assert ds.subset(duplicates=True).num_samples == ds.num_samples

    # intersecting the data with itself can not enlarge it
    # todo: also check intersection(ds, ds, duplicates=True).num_samples == ds.num_samples
    assert smlb.intersection(ds, ds).num_samples <= ds.num_samples

    # removing the data from itself leaves nothing
    # todo: also check complement(ds, ds, duplicates=True).num_samples == 0
    assert smlb.complement(ds, ds).num_samples == 0

    # the remaining checks only apply to labeled data
    if not ds.is_labeled:
        return True

    # labels() returns one entry per sample
    assert len(ds.labels()) == ds.num_samples

    # derived datasets remain labeled
    # todo: also check intersection and complement with duplicates=True
    assert ds.subset([]).is_labeled
    assert ds.subset().is_labeled
    assert smlb.intersection(ds, ds).is_labeled
    assert smlb.complement(ds, ds).is_labeled

    return True
Exemplo n.º 2
0
    def apply(self, data: Data) -> TabularData:
        """Compute matminer composition-based materials features.

        Parameters:
            data: material compositions, given as sum formula strings
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with matminer composition-based materials features
            as samples; carries the labels of `data` if `data` is labeled
        """

        data = params.instance(data, Data)

        # each sample is passed through samplef and then _composition before featurization
        inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
        features = self._mmfeatures.featurize_many(inputs_, pbar=False)

        # np.asfarray was removed in NumPy 2.0; with its default dtype it was
        # equivalent to this conversion to a float64 array
        features = np.asarray(features, dtype=np.float64)

        result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

        return result
Exemplo n.º 3
0
    def fit(self, data: Data) -> "RandomForestRegressionSklearn":
        """Train the wrapped model on labeled data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)
        """

        # todo: params.data(..., is_finite=True, is_labeled=True)
        data = params.instance(data, Data)

        # samples are validated as a real matrix with one row per sample,
        # labels as a real vector with one entry per sample
        num_rows = data.num_samples
        features = params.real_matrix(data.samples(), nrows=num_rows)
        targets = params.real_vector(data.labels(), dimensions=num_rows)

        self._model.fit(features, targets)

        return self
    def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
        """Train the wrapped model on labeled data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)

        Raises:
            InvalidParameterError: if data is not labeled
        """

        data = params.instance(data, Data)

        # training needs labels; reject unlabeled data up front
        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")

        # samples are validated as a real matrix with one row per sample,
        # labels as a real vector with one entry per sample
        sample_count = data.num_samples
        inputs = params.real_matrix(data.samples(), nrows=sample_count)
        outputs = params.real_vector(data.labels(), dimensions=sample_count)

        self._model.fit(inputs, outputs)

        return self
Exemplo n.º 5
0
    def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
        """Train the wrapped model on labeled data.

        Parameters:
            data: labeled data to train on; samples must be convertible
                  to a real matrix and labels to a real vector

        Returns:
            self (allows chaining)
        """

        # todo: params.data(..., is_finite=True, is_labeled=True)
        data = params.instance(data, Data)

        # one matrix row and one label entry are expected per sample
        size = data.num_samples
        x = params.real_matrix(data.samples(), nrows=size)
        y = params.real_vector(data.labels(), dimensions=size)

        self._model.fit(x, y)

        return self
Exemplo n.º 6
0
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Train the wrapped lolo model on labeled data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)

        Raises:
            BenchmarkError: if training raises a Py4JJavaError
        """

        # todo: params.data(..., is_labeled=True, is_finite=True)
        data = params.instance(data, Data)

        # samples are validated as a real matrix with one row per sample,
        # labels as a real vector with one entry per sample
        num_rows = data.num_samples
        features = params.real_matrix(data.samples(), nrows=num_rows)
        targets = params.real_vector(data.labels(), dimensions=num_rows)

        try:
            self._model.fit(features, targets)
        except Py4JJavaError as java_error:
            # re-raise Java-side failures as a benchmark error, keeping the cause
            raise BenchmarkError("training lolo model failed") from java_error
        else:
            return self
Exemplo n.º 7
0
    def apply(self, data: Data) -> TabularData:
        """Compute selected molecular features.

        Parameters:
            data: molecular structures given as SMILES strings.
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with CDK molecular features as samples,
            after post-processing by the failure mode's finalize()

        Raises:
            BenchmarkError: if a descriptor result does not have the expected
                arity or is of an unsupported CDK result type
        """

        data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

        failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

        # set up molecule SMILES
        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

        def parse_smiles(s: str, i: int):
            """Return parsed SMILES string or None on failure."""
            try:
                return parser.parseSmiles(self._samplef(s))
            except py4j.protocol.Py4JJavaError:
                # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
                failmode.handle_failure(i)
                return None  # internal sentinel value

        smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

        # compute descriptors
        # todo: the dtype of the columns could be set in advance by querying the descriptors
        #       currently, all values are stored as floating point numbers
        # int(...) because np.sum of an empty sequence is a float,
        # which np.empty does not accept as a dimension
        features = np.empty((data.num_samples, int(np.sum(self._arities))))
        index = 0  # first feature column of the current descriptor

        def java_is_instance_of(object_, class_):
            """Test whether a Java object is an instance of a CDK qsar result class."""
            return py4j.java_gateway.is_instance_of(
                self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
            )

        def check_arity(expected, actual):
            """Raise BenchmarkError if a descriptor yields an unexpected number of values."""
            if expected != actual:
                raise BenchmarkError(
                    f"Invalid descriptor result arity (expected {expected}, was {actual})"
                )

        for descriptor, arity in zip(self._descriptors, self._arities):
            for i, smile in enumerate(smiles):
                # test the individual molecule, not the whole tuple:
                # None marks a sample whose parsing failed, which
                # parse_smiles has already reported to failmode
                if smile is None:
                    features[i, index : index + arity] = float("nan")
                    continue

                try:
                    value = descriptor.calculate(smile).getValue()
                except py4j.protocol.Py4JJavaError:
                    failmode.handle_failure(i)
                    features[i, index : index + arity] = float("nan")
                    continue

                if java_is_instance_of(value, "IntegerResult"):
                    check_arity(arity, 1)
                    features[i, index] = int(value.intValue())
                elif java_is_instance_of(value, "DoubleResult"):
                    check_arity(arity, 1)
                    features[i, index] = float(value.doubleValue())
                elif java_is_instance_of(value, "BooleanResult"):
                    check_arity(arity, 1)
                    features[i, index] = bool(value.booleanValue())
                elif java_is_instance_of(value, "IntegerArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        int(value.get(j)) for j in range(value.length())
                    )
                elif java_is_instance_of(value, "DoubleArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        float(value.get(j)) for j in range(value.length())
                    )
                # there seems to be no BooleanArrayResult in CDK
                else:
                    name = value.getClass().getSimpleName()
                    raise BenchmarkError(f"Unsupported CDK result type '{name}'")
            index += arity

        result = (
            TabularData(data=features, labels=data.labels())
            if data.is_labeled
            else TabularData(data=features)
        )

        result = failmode.finalize(result)

        return result