def validate_data_interface(ds: smlb.Data) -> bool:
    """Verify compliance with the Data interface.

    Exercises checks that every Data-conforming class must satisfy.

    Returns:
        True

    Raises:
        AssertionError: if any compliance check fails.
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # no checks are defined for infinite datasets
        return True

    # finite data checks below
    n = ds.num_samples

    # size is a non-negative, integer-representable value
    assert int(n) == n
    assert n >= 0

    # samples() yields every sample
    assert len(ds.samples()) == n

    # subset behavior
    assert ds.subset([]).num_samples == 0
    assert ds.subset().num_samples <= n
    assert ds.subset(duplicates=True).num_samples == n

    # intersection with self
    assert smlb.intersection(ds, ds).num_samples <= n
    # todo: also support smlb.intersection(ds, ds, duplicates=True).num_samples == n

    # complement with self
    assert smlb.complement(ds, ds).num_samples == 0
    # todo: also support smlb.complement(ds, ds, duplicates=True).num_samples == 0

    if ds.is_labeled:
        # labels() yields one label per sample
        assert len(ds.labels()) == n

        # subsets remain labeled
        assert ds.subset([]).is_labeled
        assert ds.subset().is_labeled

        # intersection remains labeled
        assert smlb.intersection(ds, ds).is_labeled
        # todo: also support smlb.intersection(ds, ds, duplicates=True).is_labeled

        # complement remains labeled
        assert smlb.complement(ds, ds).is_labeled
        # todo: also support smlb.complement(ds, ds, duplicates=True).is_labeled

    return True
def finalize(self, data: Data) -> Data:
    """Change dataset according to registered failures and failure mode.

    Parameters:
        data: transformed Data

    Returns:
        Transformed Data after handling failures.

    Raises:
        BenchmarkError: in "raise" mode if any failures were registered,
            or if the failure mode is not recognized.
    """

    # de-duplicate registered indices and put them in ascending order
    self.failures = sorted(set(self.failures))

    if self.failmode == "raise":
        if self.failures:
            raise BenchmarkError("DataTransformation failed for some samples")
        return data

    if self.failmode == "drop":
        return complement(data, data.subset(self.failures))  # todo: duplicates?

    if self.failmode == "mask":
        # flag failed samples in the caller-provided boolean mask
        self.mask[self.failures] = True
        return data

    if self.failmode == "index":
        # report failed sample indices to the caller-provided list
        self.index.extend(self.failures)
        return data

    raise BenchmarkError(f"Internal error, unrecognized failure mode '{self.failmode}'")
def apply(self, data: Data, **kwargs) -> Data:
    """Draw random subset of data.

    Parameters:
        data: dataset to sample from

    Returns:
        random subset of data
    """

    data = params.instance(data, Data)
    if not data.is_finite:
        raise InvalidParameterError("finite Data", type(data).__name__)

    # the upper bound depends on the dataset and could not be
    # validated in __init__; validate it here instead (see __init__)
    sample_count = params.integer(self._size, from_=0, to=data.num_samples)

    # sample indices uniformly at random, without replacement
    chosen = self.random.choice(data.num_samples, size=sample_count, replace=False)

    return data.subset(chosen)
def apply(self, data: Data, **kwargs) -> Data:
    """Draw random vectors.

    Parameters:
        data: Data to draw from

    Returns:
        TabularData of vectors

    Raises:
        BenchmarkError: if the sampling domain has an infinite bound.
    """

    data = params.instance(data, Data)

    # normalize every branch to an ndarray so the vectorized
    # arithmetic below (domain[:, 0], domain[:, 1]) is valid even
    # when the domain is provided as a plain nested sequence
    if self._domain is None:
        if data.domain is None:
            # default to the unit hypercube
            domain = np.asarray([[0, 1]] * data.dimensions)
        else:
            domain = np.asarray(data.domain)
    else:
        # checks dimensionality (see __init__)
        domain = np.asarray(params.hypercube_domain(self._domain, dimensions=data.dimensions))

    # uniform sampling requires finite bounds in every dimension
    if np.any(np.isinf(domain)):
        raise BenchmarkError("can not sample from infinite domain")

    # vectorized: scale unit-uniform draws into [low, high] per dimension;
    # avoids a Python loop over dimensions for efficiency in high dimensions
    vectors = (
        self.random.uniform(size=(self._size, data.dimensions))
        * (domain[:, 1] - domain[:, 0])
        + domain[:, 0]  # noqa W503
    )

    return data.subset(vectors)