def __init__(self, failmode, num_samples: int):
    """Initialize failure handler.

    Parameters:
        failmode: how to handle failed descriptor calculations, either due to
            rejected SMILES encodings or failing descriptor code.
            Possible values:
            "raise": raise a benchmark exception
            "drop": drop the sample. Returned Data will have fewer samples
            ("mask", mask): where `mask` is a one-dimensional NumPy array with
                dtype bool and length `num_samples` that is used to flag
                failures; it is reset to all-False here
            ("index", index): where `index` is an empty list to which the
                indices of failed entries will be appended
        num_samples: number of samples that are transformed

    Raises:
        InvalidParameterError: for an invalid failure mode specification or a
            mask whose length differs from `num_samples`.
    """

    self.num_samples = params.integer(num_samples, from_=0)

    # the validator is looked up on the instance/class first; afterwards the
    # attribute of the same name holds the validated failure mode
    self.failmode = self.failmode(failmode)

    if is_sequence(self.failmode) and self.failmode[0] == "mask":
        self.failmode = "mask"
        mask = failmode[1]
        if len(mask) != self.num_samples:
            # report the length of the passed mask; self.mask is not set yet
            raise InvalidParameterError(
                f"failure mode mask length of {self.num_samples}", len(mask))
        self.mask = mask
        self.mask.fill(False)

    # after the "mask" branch self.failmode is a plain string, so this test
    # only matches a genuine ("index", list) specification
    if is_sequence(self.failmode) and self.failmode[0] == "index":
        self.failmode = "index"
        self.index = failmode[1]

    self.failures = []  # list of indices of failed samples
def __init__(self, data: np.ndarray, labels: Optional[np.ndarray] = None, **kwargs):
    """Initialize dataset.

    Parameters:
        data: tabular data as a NumPy ndarray
        labels: tabular data as a NumPy ndarray.
            If not specified, dataset is unlabeled.

    Raises:
        InvalidParameterError for invalid arguments. In particular,
            numbers of data and labels must match, and column names (if both
            arrays are structured) must be unique across data and labels.

    Examples:
        From numerical NumPy data:
        ```
        TabularData(numpy.ndarray(...), ...)
        ```

        From a Pandas DataFrame:
        ```
        df = pandas.DataFrame(..., columns=[...])
        TabularData(df.to_records(index=False), labels=...)
        ```

        From mixed NumPy data, with column names (note use of tuples):
        ```
        a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', str), ('D', int)])
        TabularData(a, ...)
        ```
    """

    # both arguments must be ndarrays; labels may also be None
    data = params.instance(data, np.ndarray)
    labels = params.optional_(labels, lambda arg: params.instance(arg, np.ndarray))

    if labels is not None:
        # one label entry per sample
        num_data, num_labels = data.shape[0], labels.shape[0]
        if num_data != num_labels:
            raise InvalidParameterError(
                "same number of samples and labels",
                f"{num_data} samples, {num_labels} labels",
            )

        # NumPy enforces unique field names within each structured array,
        # but not across the data and labels arrays
        data_names = data.dtype.names
        label_names = labels.dtype.names
        if is_sequence(data_names) and is_sequence(label_names):
            column_names = data_names + label_names
            if len(np.unique(column_names)) != len(column_names):
                raise InvalidParameterError(
                    "unique column names for samples and labels", column_names)

    self._data = data
    self._labels = labels

    super().__init__(**kwargs)
def normal_distribution(arg):
    """Validate a predictive normal distribution.

    Parameters:
        arg: parameter to validate; either a NormalPredictiveDistribution, or
            a pair of two same-length sequences, which is interpreted as the
            means and standard deviations of independent normal predictive
            distributions

    Returns:
        NormalPredictiveDistribution

    Raises:
        InvalidParameterError: if arg is invalid
    """

    # imported locally because of a circular dependency
    from .distributions import NormalPredictiveDistribution

    ipe = InvalidParameterError("normal distribution", arg)

    try:
        if not isinstance(arg, NormalPredictiveDistribution):
            is_pair = (
                is_sequence(arg)
                and len(arg) == 2
                and is_sequence(arg[0])
                and is_sequence(arg[1])
                and len(arg[0]) == len(arg[1])
            )
            if not is_pair:
                raise ipe
            # pair of same-length sequences: (means, standard deviations)
            means, stddevs = arg[0], arg[1]
            arg = NormalPredictiveDistribution(means, stddevs)
    except Exception as exc:
        # any failure during inspection or construction is an invalid parameter
        raise ipe from exc

    return arg
def failmode(failmode):
    """Validate a failure mode specification.

    Works like the validators in smlb.params: the argument is returned
    unchanged if valid. Accepted values are the strings "raise" and "drop",
    a pair ("mask", array) with a one-dimensional boolean NumPy array, and a
    pair ("index", list) with an empty list. See __init__ for their meaning.

    Raises:
        InvalidParameterError: if the argument is none of the above.
    """

    ipe = InvalidParameterError("valid failure mode specification", failmode)

    # plain string modes
    if failmode in ("raise", "drop"):
        return failmode

    # everything else must be a pair (kind, payload)
    if not is_sequence(failmode) or len(failmode) != 2:
        raise ipe

    kind, payload = failmode[0], failmode[1]

    if kind == "mask":
        if (isinstance(payload, np.ndarray) and payload.ndim == 1
                and payload.dtype.name == "bool"):
            return failmode

    if kind == "index":
        if isinstance(payload, list) and len(payload) == 0:
            return failmode

    raise ipe
def distribution(arg):
    """Validate a predictive distribution.

    Parameters:
        arg: parameter to validate; either a PredictiveDistribution (or
            subclass) instance, or a sequence, which is interpreted as
            specifying the means of a DeltaPredictiveDistribution

    Returns:
        PredictiveDistribution or subclass

    Raises:
        InvalidParameterError: if arg is invalid, including sequences that do
            not convert to a one-dimensional float array
    """

    # imported locally because of a circular dependency
    from .distributions import PredictiveDistribution, DeltaPredictiveDistribution

    ipe = InvalidParameterError("distribution", arg)

    try:
        if isinstance(arg, PredictiveDistribution):
            pass
        elif is_sequence(arg):
            # interpret as sequence of means
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype performs the same conversion on all NumPy versions
            arg = np.asarray(arg, dtype=float)
            if arg.ndim != 1:
                raise ipe
            arg = DeltaPredictiveDistribution(arg)
        else:
            raise ipe
    except Exception as e:
        # any failure during conversion or construction is an invalid parameter
        raise ipe from e

    return arg
def tuple_(arg, testf, *args, arity=None, default=NONE):
    """k-tuple meta-test.

    If arity is larger than the number of test functions provided,
    the last test function is repeatedly used. This enables
    `tuple_(..., f, arity=3)` for homogeneous-type tuples.

    Parameters:
        arg: parameter to validate as a tuple; any sequence is accepted
        testf: test function that accepts a single argument and validates it
        arbitrarily many further test functions can be passed
        arity: length of tuple; if not given, the larger of the number of
            test functions and the length of arg is used
        default: if specified and arity as well, too-short tuples
            are extended with default value

    Returns:
        tuple of the values returned by the test functions, one per component

    Raises:
        InvalidParameterError if arg is not a sequence, has the wrong length,
            or one of the test functions fails
    """

    if arity is None:
        arity = max(len(args) + 1, len(arg) if is_sequence(arg) else 0)

    ipe = InvalidParameterError(
        f"{arity}-tuple with valid components (tuple_)", arg)

    if not is_sequence(arg) or len(arg) > arity:
        raise ipe

    if len(arg) < arity:
        if default != params.NONE:
            # convert to tuple first: lists cannot be concatenated with a
            # tuple, and arrays would add element-wise instead of extending
            arg = tuple(arg) + (default,) * (arity - len(arg))
        else:
            raise ipe

    try:
        testfs = (testf, *args)
        last = len(testfs) - 1
        # components beyond the last test function re-use the last one
        return tuple(testfs[min(i, last)](arg[i]) for i in range(arity))
    except InvalidParameterError as e:
        raise ipe from e
def _joint_data_labels(ds):
    """Build one structured array holding both data and labels.

    Each sample becomes a single structured element, which allows NumPy set
    methods to be applied to arrays that have more than one dimension.
    For unlabeled datasets only the data part is returned.
    """

    ds = params.instance(ds, TabularData)
    num = ds.num_samples

    # data part
    data = ds._data
    if is_sequence(data.dtype.names):
        # already a structured array
        lhs = data
    else:
        # homogeneous array, possibly many dimensions: flatten each sample
        # and view its entries as fields with automatically chosen names
        flat = np.reshape(data, (num, -1))
        lhs = flat.view([("", data.dtype)] * flat.shape[1])
        lhs = np.reshape(lhs, num)

    if not ds.is_labeled:
        return lhs

    # labels part
    # alternatives for hstack() that did not work included
    # numpy.lib.recfunctions.merge_arrays.
    labels = ds._labels
    if is_sequence(labels.dtype.names):
        # already a structured array
        rhs = labels
    else:
        # homogeneous array, possibly high-dimensional: fields are named by index
        flat = np.reshape(labels, (num, -1))
        rhs = flat.view([(str(i), flat.dtype) for i in range(flat.shape[1])])
        rhs = np.reshape(rhs, num)

    # both parts are structured arrays (views) now; np.hstack fails for
    # these, so the fields are copied one by one into a fresh array
    merged = np.empty(num, dtype=lhs.dtype.descr + rhs.dtype.descr)
    for part in (lhs, rhs):
        for name in part.dtype.names:
            merged[name] = part[name]

    return merged
def render(self):
    """Renders evaluation.

    Specific derived classes should override `_render`, not this method.

    The configured target (`self._target`) can be None (a new figure is
    created and closed again), a filename string (a new figure is created,
    saved to that file and closed), a matplotlib Axes, a (Figure, Axes)
    tuple, or a sequence of such targets, each of which is rendered in turn.
    """

    def render_target(target):
        """Render to one target; sequences of targets are processed recursively."""

        # a sequence is a collection of targets unless it is a (Figure, Axes)
        # pair; an empty sequence simply renders nothing
        if is_sequence(target) and (
            len(target) == 0 or not isinstance(target[0], mpl.figure.Figure)
        ):
            for tgt in target:
                render_target(tgt)
            return

        # a bare Axes is paired with the current figure
        if isinstance(target, mpl.axes.Axes):
            target = (plt.gcf(), target)

        # remember filename for export
        if isinstance(target, str):
            filename, target = target, None
        else:
            filename = None

        # create new plot if necessary
        owner = target is None
        if owner:
            target = plt.subplots()

        self._figax = target  # tuple(Figure, Axes)

        try:
            # set matplotlib plot settings
            # at this time, settings such as axes labels or scales contain the
            # correct values, but have not been set yet as the figure and axes
            # were just created. re-assignment sets (or 'activates') these
            # values for the new figure and axes.
            self.axes_labels = self.axes_labels
            self.axes_scales = self.axes_scales

            self._render(target)

            # export to filename if requested
            if filename is not None:
                self.fig.savefig(filename, bbox_inches="tight", pad_inches=0)
        finally:
            # clean up even if rendering failed, so that figures created
            # here are not leaked
            if owner:
                plt.close(self.fig)  # fig.clear() might not release all memory
            self._figax = None

    render_target(self._target)
def test_is_sequence_examples():
    """Tests whether is_sequence complies to docstring via examples."""

    # containers that count as sequences
    assert smlb.is_sequence([1, 2, 3]), "list"
    assert smlb.is_sequence((1, 2, 3)), "tuple"
    # np.asfarray was removed in NumPy 2.0; use asarray with a float dtype
    assert smlb.is_sequence(np.asarray([1, 2, 3], dtype=float)), "array"

    # strings, bytes and unordered containers do not count as sequences
    assert not smlb.is_sequence("str"), "string"
    assert not smlb.is_sequence(b"bytes"), "bytes"
    assert not smlb.is_sequence(dict(a=1, b=2)), "dictionary"
    assert not smlb.is_sequence({1, 2, 3}), "set"
def sequence(arg, length=None, type_=None, testf=None):
    """Sequence.

    Sequence, of given length and type if specified.

    Parameters:
        arg: parameter to be validated as a sequence
        length: required length of sequence or None (default)
        type_: required type (or tuple of types) for all sequence elements
            or None (default)
        testf: optional function called on every element; any exception it
            raises marks the sequence as invalid. None (default) for no test

    Returns:
        arg if a sequence

    Raises:
        InvalidParameterError if arg is not a sequence,
            of given length and type if specified, satisfying testf if given
    """

    # describe the expectation for the error message
    ipe_length = "" if length is None else f" of length {length}"
    if type_ is None:
        ipe_type = ""
    else:
        # name the required type(s) themselves, not their metaclass
        types = type_ if isinstance(type_, tuple) else (type_,)
        names = " or ".join(getattr(t, "__name__", repr(t)) for t in types)
        ipe_type = f" of type {names}"
    ipe_testf = "" if testf is None else " with constraints"
    ipe = InvalidParameterError(
        f"a sequence{ipe_length}{ipe_type}{ipe_testf}", arg)

    if not is_sequence(arg):
        raise ipe

    if length is not None and len(arg) != length:
        raise ipe

    if type_ is not None and not all(isinstance(el, type_) for el in arg):
        raise ipe

    if testf is not None:
        try:
            for el in arg:
                testf(el)
        except Exception as e:
            # test functions may signal failure with arbitrary exceptions
            raise ipe from e

    return arg
def test_orient():
    """Test orient argument for oriented metrics."""

    true = smlb.NormalPredictiveDistribution([1, 2, 3], [0.5, 0.6, 0.7])
    pred = smlb.NormalPredictiveDistribution([1.1, 2.2, 2.9], [0.4, 0.7, 0.65])

    metric_classes = (
        smlb.MeanAbsoluteError,
        smlb.MeanSquaredError,
        smlb.RootMeanSquaredError,
        smlb.MeanLogPredictiveDensity,
        smlb.MeanContinuousRankedProbabilityScore,
    )

    for c in metric_classes:
        # flipping the orientation must flip the sign of the result
        resa = c(orient=-1)(true, pred)
        resb = c(orient=+1)(true, pred)

        if not smlb.is_sequence(resa):
            assert resa == -resb, c.__name__
        else:
            assert (resa == -resb).all(), c.__name__

        # ensure misspelt argument raises
        with pytest.raises(Exception):
            c(orientt=-1)
def evaluate(self, results, **kwargs):
    """Compute plot data for multiple generalized (set-valued) functions.

    Multiple curves C_1, ..., C_k can be drawn.
    Each curve C_i is specified by a non-empty sequence of 2-tuples,
    where the first value is location on horizontal axis, and the
    other value is a sequence of locations on the vertical axis.
    Each curve can be drawn in a different way (points, box-whisker,
    shaded-line).

    The result is stored in `self._plotdata`, one entry per curve:
    an array of (position, value) rows for "points", a tuple
    (positions, values, widths) for "box-whisker", and a tuple
    (positions, values) for "shaded-line".

    Parameters:
        results: sequence of generalized functions data (curve data).
            Each datum is a sequence of tuples (x,fx), where
            x is a real number and fx is a sequence of real numbers.

    Raises:
        InvalidParameterError: for invalid curve data, or for more curves
            than can be rectified while rectification is active.

    Examples:
        # two curves sharing one horizontal location
        evaluate([
            [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
            [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))],  # curve 2
        ])
    """

    super().evaluate(results=results, **kwargs)

    # parameter validation
    tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
    curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
    results = params.tuple_(results, curve_testf)

    # _rectify evaluates to True if True or if > 0
    if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
        # report the number of curves actually passed
        raise InvalidParameterError(
            f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(results)} curves"
        )

    # finalize parameter validation for visualization_type
    if not is_sequence(self._visualization_type):
        self._visualization_type = (self._visualization_type,) * len(results)
    self._visualization_type = params.tuple_(
        self._visualization_type,
        lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
        arity=len(results),
        default="points",
    )

    # prepare plot

    # determine all distinct horizontal positions in the results data
    all_positions = np.unique([entry[0] for curve in results for entry in curve])

    # there is nothing to do without data to plot
    if len(all_positions) == 0:
        self._plotdata = []
        return

    # do not rectify if there is only a single horizontal position
    if len(all_positions) == 1 or self._rectify is False:
        self._rectify = 0.0

    # automatic determination of horizontal rectification factor
    #
    # the correct way to draw box-plots on a logarithmic horizontal axis is to have
    # different left-width and right-width of the boxes. However, matplotlib does not
    # support this. Because box widths are small compared to horizontal plot range,
    # it suffices to use the sum of left- and right-half widths.

    between_groups_spacing = 0.4
    in_group_spacing = 0.9  # box-whisker plots

    # NOTE(review): logf/powf stay undefined for any other horizontal scale;
    # presumably only "linear" and "log" can occur here - confirm
    if self.axes_scales[0] == "linear":
        logf = lambda arg: arg
        powf = lambda arg: arg
    elif self.axes_scales[0] == "log":
        base = 10
        logf = lambda arg: np.log(arg) / np.log(base)
        powf = lambda arg: np.power(base, arg)

    if self._rectify is True:
        # diff(...) requires at least two horizontal locations; this is ensured above
        self._rectify = (
            between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
        )

    # determine positions
    self._plotdata = [None] * len(results)
    # NOTE(review): RECTIFY_DELTAS is indexed by the number of curves here but
    # only bounded by its length above - confirm the largest index is valid
    deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
    for i, curve in enumerate(results):
        # point markers, every single point is drawn
        if self._visualization_type[i] == "points":
            positions = powf(
                np.hstack(
                    [
                        logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                        for entry in curve
                    ]
                )
            )
            values = np.hstack([entry[1] for entry in curve])
            self._plotdata[i] = np.transpose([positions, values])

        # box-whisker plots
        elif self._visualization_type[i] == "box-whisker":
            # np.asfarray was removed in NumPy 2.0; asarray with an explicit
            # float dtype is the equivalent conversion
            positions = np.asarray(
                [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve],
                dtype=float,
            )
            values = [entry[1] for entry in curve]
            # can't use rectify for width if 0; 1 is a wild guess
            # todo: if plot ranges have been set, a better default value could
            #       be 10% of horizontal plot range
            w = 1 if not self._rectify else self._rectify
            widths = powf((positions + w / 2) * in_group_spacing) - powf(
                (positions - w / 2) * in_group_spacing
            )
            positions = powf(positions)
            self._plotdata[i] = (positions, values, widths)

        # shaded lines use the unmodified horizontal positions
        elif self._visualization_type[i] == "shaded-line":
            positions = np.asarray([entry[0] for entry in curve], dtype=float)
            values = [entry[1] for entry in curve]
            self._plotdata[i] = (positions, values)

        else:
            raise BenchmarkError("internal error, unknown visualization type")