def validate_data_interface(ds: smlb.Data) -> bool:
    """Tests for compliance with Data interface.

    Runs tests that every Data-compliant class should satisfy.

    Returns:
        True

    Raises:
        AssertionError for failed tests
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # infinite data tests
        pass
    else:
        # finite data tests

        # integer-representable non-negative size
        assert int(ds.num_samples) == ds.num_samples
        assert ds.num_samples >= 0

        # all samples are returned
        assert len(ds.samples()) == ds.num_samples

        # subsets
        assert ds.subset([]).num_samples == 0
        assert ds.subset().num_samples <= ds.num_samples
        assert ds.subset(duplicates=True).num_samples == ds.num_samples

        # intersection with self
        assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
        # assert smlb.intersection(ds, ds, duplicates=True).num_samples == ds.num_samples  # todo: support this as well

        # complement with self
        assert smlb.complement(ds, ds).num_samples == 0
        # assert smlb.complement(ds, ds, duplicates=True).num_samples == 0  # todo: support this as well

        if ds.is_labeled:
            # all labels are returned
            assert len(ds.labels()) == ds.num_samples

            # subsets
            assert ds.subset([]).is_labeled
            assert ds.subset().is_labeled

            # intersection
            assert smlb.intersection(ds, ds).is_labeled
            # assert smlb.intersection(ds, ds, duplicates=True).is_labeled  # todo: support this as well

            # complement
            assert smlb.complement(ds, ds).is_labeled
            # assert smlb.complement(ds, ds, duplicates=True).is_labeled  # todo: support this as well

    return True
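# Usage sketch (hypothetical): validating a small in-memory dataset.
# Assumes that TabularData is exported by the top-level smlb package and accepts
# NumPy arrays via `data=` and `labels=`, as in the featurizer code further below;
# adjust the construction if the actual interface differs.
import numpy as np
import smlb

samples = np.asarray([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
labels = np.asarray([0.5, 1.5, 2.5])

dataset = smlb.TabularData(data=samples, labels=labels)
assert validate_data_interface(dataset)  # raises AssertionError if any check fails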
def apply(self, data: Data) -> TabularData:
    """Compute matminer composition-based materials features.

    Parameters:
        data: material compositions, given as sum formula strings.
              Can be labeled, and labels will be retained

    Returns:
        TabularData with matminer composition-based materials features as samples;
        labeled if the input data was labeled
    """

    data = params.instance(data, Data)

    inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
    features = self._mmfeatures.featurize_many(inputs_, pbar=False)
    features = np.asfarray(features)

    result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

    return result
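# Usage sketch (hypothetical): `featurizer` stands in for an instance of the
# (unnamed) class defining this `apply`, and `compositions` for any smlb.Data
# whose samples() are sum formula strings such as "Fe2O3"; both names are
# assumptions. The result is a TabularData of matminer descriptors.
features = featurizer.apply(compositions)      # one row per composition
X = features.samples()                         # NumPy matrix of descriptor values
y = features.labels() if features.is_labeled else None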
def fit(self, data: Data) -> "RandomForestRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance(data, Data) if not data.is_labeled: raise InvalidParameterError("labeled data", "unlabeled data") n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "GaussianProcessRegressionSklearn": """Fits the model using training data. Parameters: data: labeled data to train on; must derive from IndexedData and LabeledData Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "RandomForestRegressionLolo": """Fits the model using training data. Parameters: data: labeled tabular data to train on Returns: self (allows chaining) """ data = params.instance( data, Data ) # todo: params.data(..., is_labeled=True, is_finite=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) try: self._model.fit(xtrain, ytrain) except Py4JJavaError as e: raise BenchmarkError("training lolo model failed") from e return self
def apply(self, data: Data) -> TabularData:
    """Compute selected molecular features.

    Parameters:
        data: molecular structures given as SMILES strings.
              Can be labeled, and labels will be retained

    Returns:
        TabularData with CDK molecular features as samples
    """

    data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

    failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

    # set up molecule SMILES
    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

    def parse_smiles(s: str, i: int):
        """Return parsed SMILES string or None on failure."""
        try:
            return parser.parseSmiles(self._samplef(s))
        except py4j.protocol.Py4JJavaError:
            # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
            failmode.handle_failure(i)
            return None  # internal sentinel value

    smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

    # compute descriptors
    # todo: the dtype of the columns could be set in advance by querying the descriptors
    #       currently, all values are stored as floating point numbers
    features = np.empty((data.num_samples, np.sum(self._arities)))
    index = 0

    def java_is_instance_of(object_, class_):
        return py4j.java_gateway.is_instance_of(
            self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
        )

    def check_arity(expected, actual):
        if expected != actual:
            raise BenchmarkError(
                f"Invalid descriptor result arity (expected {expected}, was {actual})"
            )

    for descriptor, arity in zip(self._descriptors, self._arities):
        for i, smile in enumerate(smiles):
            if smile is None:  # SMILES parsing failed earlier; leave features as NaN
                features[i, index : index + arity] = float("nan")
                continue

            try:
                value = descriptor.calculate(smile).getValue()
            except py4j.protocol.Py4JJavaError:
                failmode.handle_failure(i)
                features[i, index : index + arity] = float("nan")
                continue

            if java_is_instance_of(value, "IntegerResult"):
                check_arity(arity, 1)
                features[i, index] = int(value.intValue())
            elif java_is_instance_of(value, "DoubleResult"):
                check_arity(arity, 1)
                features[i, index] = float(value.doubleValue())
            elif java_is_instance_of(value, "BooleanResult"):
                check_arity(arity, 1)
                features[i, index] = bool(value.booleanValue())
            elif java_is_instance_of(value, "IntegerArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    int(value.get(j)) for j in range(value.length())
                )
            elif java_is_instance_of(value, "DoubleArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    float(value.get(j)) for j in range(value.length())
                )
            # there seems to be no BooleanArrayResult in CDK
            else:
                name = value.getClass().getSimpleName()
                raise BenchmarkError(f"Unsupported CDK result type '{name}'")

        index += arity

    result = (
        TabularData(data=features, labels=data.labels())
        if data.is_labeled
        else TabularData(data=features)
    )
    result = failmode.finalize(result)

    return result
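# Usage sketch (hypothetical): `cdk_featurizer` stands in for an instance of the
# (unnamed) CDK-based featurizer defining this `apply`, and `molecules` for any
# smlb.Data whose samples() are SMILES strings; both names are assumptions.
# Invalid SMILES and descriptor failures are handled by the
# DataTransformationFailureMode configured at construction (self._failmode).
molecular_features = cdk_featurizer.apply(molecules)   # TabularData, one row per molecule
X = molecular_features.samples()                       # columns = sum of descriptor arities
y = molecular_features.labels() if molecular_features.is_labeled else None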