def box_whisker(self, positions, values, color=0, widths=0.5, **kwargs):
    """Draw box-whisker plots.

    Parameters:
        positions: where to place plots on horizontal axis
        values: samples for each location
        color: color index
        widths: widths of boxes
    """

    positions = params.real_vector(positions)
    values = params.tuple_(values, params.real_vector, arity=len(positions))
    color = params.integer(color, from_=0, below=len(self.configuration.color_set))
    widths = params.real_vector(widths, dimensions=len(positions), domain=(0, 999))

    color = self.configuration.color(color)

    self.ax.boxplot(
        values,
        positions=positions,
        whis=(0, 100),
        bootstrap=None,
        widths=widths,
        notch=False,
        showmeans=True,
        boxprops={"color": color},
        whiskerprops={"color": color},
        capprops={"color": color},
        meanprops={"marker": "*", "markerfacecolor": color, "markeredgecolor": color},
        medianprops={"color": color},
        manage_ticks=False,
        **kwargs,
    )
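
# Usage sketch (sample data made up; assumes `plot` is an instance of the
# surrounding plot class with a configured matplotlib axes). Note that `widths`
# is validated as one width per box, so a vector is passed explicitly.
import numpy as np

rng = np.random.default_rng(0)
positions = [1.0, 2.0, 3.0]
values = [rng.normal(loc=p, scale=0.2, size=50) for p in positions]

plot.box_whisker(positions, values, color=0, widths=[0.4, 0.4, 0.4])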
def test_real_vector2():
    """Accumulated test cases."""

    # a single interval (a, b) is extended to all dimensions; a list [(a, b)] is not
    assert (
        params.real_vector([0.5, 0.5, 1], dimensions=3, domain=(0, np.inf))
        == np.asfarray([0.5, 0.5, 1])
    ).all()
    with pytest.raises(InvalidParameterError):
        params.real_vector([0.5, 0.5, 1], dimensions=3, domain=[(0, np.inf)])
def __init__(
    self,
    internal_hp_optimization: bool = True,
    kernel: Optional[Kernel] = None,
    alpha: Union[float, Sequence] = 1e-5,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=0,
    normalize_y=False,
    random_state: Optional[int] = None,
    **kwargs,
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        internal_hp_optimization: if True, hyperparameters are optimized "internally"
            by the Gaussian process, that is, scikit-learn optimizes hyperparameters
            and for smlb the learner has no hyperparameters; if False, hyperparameters
            are optimized by smlb (and scikit-learn does not optimize any hyperparameters)
        kernel: scikit-learn kernel; if None, a Gaussian (RBF) kernel plus a
            white-noise kernel is used as default
        alpha: regularization constant (scalar or vector); added as-is to the kernel
            matrix diagonal. Equivalent to adding a WhiteKernel; the default is the
            corresponding value from scikit-learn's WhiteKernel, and different from
            scikit-learn's GaussianProcessRegressor.
        optimizer: hyperparameter optimization algorithm; used only if
            internal_hp_optimization is True
        n_restarts_optimizer: number of times the optimizer is restarted; used only
            if internal_hp_optimization is True
        normalize_y: whether to subtract the mean of the labels
        random_state: integer seed

    See skl.gaussian_process.GaussianProcessRegressor parameters.
    """

    super().__init__(**kwargs)

    internal_hp_optimization = params.boolean(internal_hp_optimization)
    kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel), params.none)
    # incomplete check for alpha as dimension becomes known only at fitting time
    alpha = params.any_(
        alpha,
        lambda arg: params.real(arg, from_=0),
        lambda arg: params.real_vector(arg, domain=[0, np.inf]),
    )
    # todo: check optimizer, requires params.union (of string and callable) and params.function
    normalize_y = params.boolean(normalize_y)
    random_state = params.any_(random_state, lambda arg: params.integer(arg), params.none)

    if kernel is None:
        kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

    assert internal_hp_optimization is True  # external HP optimization not yet supported

    self._model = skl.gaussian_process.GaussianProcessRegressor(
        kernel=kernel,
        alpha=alpha,
        optimizer=optimizer,
        n_restarts_optimizer=n_restarts_optimizer,
        normalize_y=normalize_y,
        random_state=random_state,
    )
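
# Usage sketch: constructing the learner with a custom kernel. The kernel choice
# is illustrative only; any scikit-learn Kernel works, and constructor arguments
# besides those documented above are omitted.
from sklearn.gaussian_process.kernels import Matern, WhiteKernel

kernel = Matern(nu=1.5) + WhiteKernel()
learner = GaussianProcessRegressionSklearn(kernel=kernel, random_state=42)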
def __init__(self, mean, stddev, **kwargs):
    """Initialize state.

    The normal distribution is completely characterized by its mean and standard deviation.

    Parameters:
        mean: a sequence of means (floats)
        stddev: a sequence of standard deviations (non-negative floats)
    """

    super().__init__(**kwargs)

    self._mean = params.real_vector(mean)
    self._stddev = params.real_vector(stddev, dimensions=len(self._mean), domain=(0, np.inf))
def two_sample_cumulative_distribution_function_statistic(
    sample_a, sample_b, f=lambda p, t: np.square(p - t), g=lambda s, w: np.sum(s * w)
):
    r"""Compute a statistic of the difference between two empirical cumulative distribution functions.

    Calculate statistics of the cumulative distribution functions (CDF) of two samples.
    Let $x_1,\ldots,x_d$ be the union of the two samples, $x_i < x_{i+1}$, and let
    $w_i = x_{i+1}-x_i$, $i = 1,\ldots,d-1$ be the differences between them. The
    calculated statistics have the form $g(s,w)$ where $s_i = f(F_a(x_i), F_b(x_i))$
    and $F_a$, $F_b$ are the CDFs of the two samples. Here, the $x_i$ are the points
    where one or both of the CDFs change, $f$ is a statistic that depends on the value
    of the two CDFs, and $g$ is an arbitrary function of $s$ and $w$.

    The default choice for $g$ is Riemann integration; as the CDFs are step functions,
    this is exact and leads to statistics of the form

    \[ \int_{-\infty}^{\infty} f(F_a(x),F_b(x)) dx . \]

    Parameters:
        sample_a: first sample; a sequence of real numbers
        sample_b: second sample; a sequence of real numbers; can be of different
            length than first sample
        f: function accepting two same-length real vectors, returning a real vector
            of same length. This function computes a value that depends only on the
            two CDFs, and is thus constant between change points. The default is the
            squared difference, f(a,b) = np.square(a-b). The convention here is to
            use the left endpoint of the "steps".
        g: function accepting two same-length real vectors, returning a real number.
            Computes the statistic based on values of f and step "widths". The
            default, g(s,w) = np.sum(s * w), performs Riemann integration.
    """

    sample_a = params.real_vector(sample_a)
    sample_b = params.real_vector(sample_b)

    allx = np.union1d(sample_a, sample_b)  # all x where F_a or F_b changes
    xdif = np.ediff1d(allx)  # widths of the Riemann integration bars
    allx = allx.reshape((len(allx), 1))
    # empirical CDFs of both samples, evaluated at all change points
    cdfa = np.count_nonzero(np.sort(sample_a) <= allx, axis=1) / len(sample_a)
    cdfb = np.count_nonzero(np.sort(sample_b) <= allx, axis=1) / len(sample_b)

    stat = np.asfarray(f(cdfa, cdfb))
    return g(stat[:-1], xdif)
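
# Worked example (sample values made up). With the defaults, this integrates the
# squared difference between the two empirical CDFs; swapping f and g yields a
# Kolmogorov-Smirnov-like supremum statistic.
import numpy as np

a = np.asfarray([0.0, 1.0])
b = np.asfarray([0.5])

# the CDFs differ by 0.5 on [0, 0.5) and by 0.5 on [0.5, 1), so the integral is
# 0.25 * 0.5 + 0.25 * 0.5 = 0.25
cvm = two_sample_cumulative_distribution_function_statistic(a, b)

# largest absolute difference between the two CDFs (here 0.5)
ks = two_sample_cumulative_distribution_function_statistic(
    a, b, f=lambda p, t: np.abs(p - t), g=lambda s, w: np.max(s)
)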
def __init__(self, mean, **kwargs):
    """Initialize state.

    Parameters:
        mean: sequence of means (floats)
    """

    super().__init__(**kwargs)

    self._mean = params.real_vector(mean)
def apply(self, dist: PredictiveDistribution) -> Sequence[float]:
    """Calculate the likelihood of the given distribution improving on the target value.

    This currently only works for normal distributions. To extend to non-normal
    distributions, we should have the `PredictiveDistribution` class expose a
    `cdf()` method.

    Parameters:
        dist: a univariate predictive distribution

    Returns:
        The probability mass of the distribution that is above/below the target
        (depending on whether the goal is to maximize or minimize).
    """

    mean = params.real_vector(dist.mean)
    stddev = params.real_vector(dist.stddev, dimensions=len(mean), domain=(0, np.inf))

    # If the goal is to minimize, negate the target and the mean values.
    # Then, calculate the likelihood of improvement assuming maximization.
    target = self._target * self._direction
    mean = mean * self._direction

    return np.asfarray([self._calculate_li_above(m, s, target) for m, s in zip(mean, stddev)])
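
# Sketch of the negation trick for a single normal prediction. The internal
# helper _calculate_li_above is not shown; li_above here is an assumed
# equivalent using scipy's survival function.
import numpy as np
from scipy.stats import norm

def li_above(m: float, s: float, t: float) -> float:
    """Probability that N(m, s) exceeds target t."""
    return norm.sf((t - m) / s)

m, s, t = 1.0, 0.5, 1.5
p_max = li_above(m, s, t)    # maximizing: P(X > t)
p_min = li_above(-m, s, -t)  # minimizing: P(X < t) = P(-X > -t)
assert np.isclose(p_min, norm.cdf((t - m) / s))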
def shaded_line(
    self,
    positions: np.ndarray,
    values: List[np.ndarray],
    color_idx: int = 0,
    label: Optional[str] = None,
    quantile_width: float = 0.5,
    alpha: float = 0.2,
    show_extrema: bool = True,
    **kwargs,
):
    """Draw a line plot with shaded quantiles.

    Parameters:
        positions: 1-d array of point locations on the horizontal axis
        values: list of arrays, each one containing all of the values at a given
            location; len(values) must equal len(positions)
        color_idx: color index
        label: line label
        quantile_width: fraction of the range to shade. For the default value, 0.5,
            shade from the 25th percentile to the 75th percentile.
        alpha: shading alpha level
        show_extrema: whether or not to draw dashed lines at the best/worst point
    """

    positions = params.real_vector(positions)
    values = params.tuple_(values, params.real_vector, arity=len(positions))
    color_idx = params.integer(color_idx, from_=0, below=len(self.configuration.color_set))
    quantile_width = params.real(quantile_width, from_=0, to=1)
    alpha = params.real(alpha, from_=0, to=1)

    color = self.configuration.color(color_idx)
    lower_bound = 0.5 - quantile_width / 2.0
    upper_bound = 0.5 + quantile_width / 2.0

    median = [np.median(samples) for samples in values]
    lower_shading = [np.quantile(samples, lower_bound) for samples in values]
    upper_shading = [np.quantile(samples, upper_bound) for samples in values]

    self.ax.plot(positions, median, linestyle="-", color=color, label=label, **kwargs)
    self.ax.fill_between(
        positions,
        lower_shading,
        upper_shading,
        color=color,
        alpha=alpha,
        **kwargs,
    )

    if show_extrema:
        min_val = [np.min(samples) for samples in values]
        max_val = [np.max(samples) for samples in values]
        self.ax.plot(positions, min_val, linestyle="--", color=color, **kwargs)
        self.ax.plot(positions, max_val, linestyle="--", color=color, **kwargs)
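
# Usage sketch (sample data made up; assumes `plot` is an instance of the
# surrounding plot class). With quantile_width=0.5 this shades the
# interquartile range around the median line.
import numpy as np

rng = np.random.default_rng(0)
positions = np.linspace(0.0, 1.0, 5)
values = [rng.normal(loc=p, scale=0.1, size=100) for p in positions]

plot.shaded_line(positions, values, color_idx=1, label="median", quantile_width=0.5)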
def __init__(self, mean, stddev, corr, **kwargs):
    """Initialize state.

    The correlated normal distribution is completely characterized by its mean,
    standard deviations, and correlation matrix.

    Parameters:
        mean: a sequence of means (floats)
        stddev: a sequence of standard deviations (non-negative floats)
        corr: a matrix of Pearson correlations between individual predictions
            (floats between -1 and 1)
    """

    super().__init__(**kwargs)

    self._mean = params.real_vector(mean)
    self._stddev = params.real_vector(stddev, dimensions=len(self._mean), domain=(0, np.inf))
    self._corr = params.real_matrix(corr, nrows=len(self._mean), ncols=len(self._mean))
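
# Construction sketch (class name assumed for illustration): two correlated
# predictions with a symmetric, unit-diagonal correlation matrix.
import numpy as np

mean = [0.0, 1.0]
stddev = [0.5, 0.25]
corr = np.asfarray([[1.0, 0.3], [0.3, 1.0]])

dist = CorrelatedNormalPredictiveDistribution(mean, stddev, corr)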
def test_real_vector_1():
    """Tests real vectors."""

    assert np.array_equal(params.real_vector([1]), np.asfarray([1]))
    assert np.array_equal(params.real_vector([1, 2], dimensions=2), np.asfarray([1, 2]))
    assert np.array_equal(
        params.real_vector([1, 2], dimensions=2, domain=[0, 3]), np.asfarray([1, 2])
    )
    assert np.array_equal(
        params.real_vector([1, 2], domain=[[0.5, 1.5], [0, 3]]), np.asfarray([1, 2])
    )
    with pytest.raises(InvalidParameterError):
        params.real_vector([1, 2], dimensions=3)
    with pytest.raises(InvalidParameterError):
        params.real_vector([1, 2], domain=[0, 1.5])
def fit(self, data: Data) -> "RandomForestRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance(data, Data) if not data.is_labeled: raise InvalidParameterError("labeled data", "unlabeled data") n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "GaussianProcessRegressionSklearn": """Fits the model using training data. Parameters: data: labeled data to train on; must derive from IndexedData and LabeledData Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "RandomForestRegressionLolo": """Fits the model using training data. Parameters: data: labeled tabular data to train on Returns: self (allows chaining) """ data = params.instance( data, Data ) # todo: params.data(..., is_labeled=True, is_finite=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) try: self._model.fit(xtrain, ytrain) except Py4JJavaError as e: raise BenchmarkError("training lolo model failed") from e return self