def compare_scipy_1d(self, kernel):
    print("\n|Test_KDE_PDF:test_compare_scipy_1d()|")
    NUM = 100
    a1 = np.random.normal(6.0, 1.0, NUM // 2)
    a2 = np.random.lognormal(0, 0.5, size=NUM // 2)
    aa = np.concatenate([a1, a2])

    bins = utils.spacing([-1, 14.0], 'lin', 40)
    grid = utils.spacing(bins, 'lin', 3000)
    methods = ['scott', 0.04, 0.2, 0.8]
    classes = [
        lambda xx, bw: sp.stats.gaussian_kde(xx, bw_method=bw),
        lambda xx, bw: kale.KDE(xx, bandwidth=bw, kernel=kernel)
    ]
    for mm in methods:
        kde_list = []
        for cc in classes:
            # `kale.KDE` provides `density`, which returns (points, density);
            # `sp.stats.gaussian_kde` does not, so fall back to its `pdf` method
            try:
                test = cc(aa, mm).density(grid, probability=True)[1]
            except AttributeError:
                test = cc(aa, mm).pdf(grid)

            kde_list.append(test)

        print("method: {}".format(mm))
        print("\t" + utils.stats_str(kde_list[0]))
        print("\t" + utils.stats_str(kde_list[1]))
        assert_true(np.allclose(kde_list[0], kde_list[1]))

    return
def ppf(self, cd):
    """Percentile Point Function - the inverse of the cumulative distribution function.

    NOTE: for symmetric kernels, this (effectively) uses points only with cdf in
    [0.0, 0.5], which produces better numerical results (unclear why).
    """
    if self._ppf_func is None:
        x0, y0 = self.cdf_grid
        self._ppf_func = sp.interpolate.interp1d(
            y0, x0, kind='cubic', fill_value='extrapolate')   # **self._INTERP_KWARGS)

    # Symmetry can be utilized to get better accuracy of results, see 'note' above
    if self.SYMMETRIC:
        cd = np.atleast_1d(cd)
        idx = (cd > 0.5)
        cd = np.copy(cd)
        cd[idx] = 1 - cd[idx]

    try:
        xx = self._ppf_func(cd)
    except ValueError:
        logging.error("`_ppf_func` failed!")
        logging.error("input `cd` = {} <=== {}".format(
            utils.stats_str(cd), utils.array_str(cd)))
        for vv in self.cdf_grid:
            logging.error("\tcdf_grid: {} <== {}".format(
                utils.stats_str(vv), utils.array_str(vv)))
        raise

    if self.SYMMETRIC:
        xx[idx] = -xx[idx]

    return xx
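
# ======================================================================================
# Illustrative sketch (not part of the library): the interpolation-based CDF inversion
# that `ppf` performs above, shown standalone for a unit Gaussian.  The grid pair
# (`xc`, `yc`) stands in for `cdf_grid`; all names below are hypothetical.
# ======================================================================================
import numpy as np
import scipy as sp
import scipy.interpolate  # noqa
import scipy.stats  # noqa

xc = np.linspace(-6.0, 6.0, 1000)   # grid spanning the kernel's effective support
yc = sp.stats.norm.cdf(xc)          # CDF evaluated on that grid
# Interpolate the swapped (CDF, x) pairs to obtain an approximate inverse CDF
ppf_func = sp.interpolate.interp1d(yc, xc, kind='cubic', fill_value='extrapolate')

# Inverse-transform sampling: uniform draws mapped through the interpolated PPF
samps = ppf_func(np.random.uniform(0.0, 1.0, 100))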
def test_resample_keep_params_1(self):
    print("\n|Test_KDE_Resample:test_resample_keep_params_1()|")
    np.random.seed(9235)
    NUM = int(1e3)

    # Construct some random data
    # ------------------------------------
    a1 = np.random.normal(6.0, 1.0, NUM // 2)
    a2 = np.random.lognormal(1.0, 0.5, size=NUM // 2)
    aa = np.concatenate([a1, a2])
    # aa = a1
    bb = np.random.normal(3.0, 0.02, aa.size) + aa / 100

    data = [aa, bb]
    norm = 2.3

    # Add an array of fixed-magnitude (+/- `norm`) values at location `ii`;
    # make sure they are preserved in the resample
    for ii in range(3):
        test = np.array(data)
        tt = norm * np.ones_like(test[0])
        idx = np.random.choice(tt.size, tt.size // 2)
        tt[idx] *= -1
        test = np.insert(test, ii, tt, axis=0)

        # Construct KDE
        kde3d = kale.KDE(test)

        # Resample from the KDE, preserving the fixed parameter
        samples = kde3d.resample(NUM, keep=ii)

        # Make sure the fixed values are still the same
        param_samp = samples[ii]
        assert_true(np.all(np.isclose(param_samp, norm) | np.isclose(param_samp, -norm)))

        # Make sure the other two parameters are consistent (KS-test) with the input data
        samples = np.delete(samples, ii, axis=0)
        for jj in range(2):
            stuff = [samples[jj], data[jj]]
            ks, pv = sp.stats.ks_2samp(*stuff)
            msg = "{} {} :: {:.2e} {:.2e}".format(ii, jj, ks, pv)
            print("\t" + utils.stats_str(stuff[0]))
            print("\t" + utils.stats_str(stuff[1]))
            print(msg)
            assert_true(pv > 0.05)

    return
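
# Illustrative usage sketch: the `keep` argument exercised by the test above.  Assuming
# `kale` is the kalepy package imported by these tests, resampling with `keep=ii` pins
# parameter `ii` to its exact input values instead of convolving it with the kernel,
# e.g. (hypothetical data):
#
#     samples = kale.KDE(data).resample(1000, keep=0)   # parameter 0 reproduced exactly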
def _check_reflect(reflect, data, weights=None, helper=False):
    """Make sure the given `reflect` argument is valid given the data shape.
    """
    if reflect is None:
        return reflect

    if reflect is False:
        return None

    # NOTE: FIX: Should this happen in the method that calls `_check_reflect`?
    # data = np.atleast_2d(data)
    # ndim, nval = np.shape(data)
    data = np.asarray(data)
    ndim, nval = data.shape

    if reflect is True:
        reflect = [True for ii in range(ndim)]

    # Allow a single (2,) pair to be given for 1D data
    if (len(reflect) == 2) and (ndim == 1):
        reflect = np.atleast_2d(reflect)

    if len(reflect) != ndim:
        err = "length of `reflect` ({}) must match the number of parameters in `data` (shape: {})!".format(
            len(reflect), data.shape)
        raise ValueError(err)

    try:
        goods = [(ref is None) or (ref is True) or (len(ref) == 2) for ref in reflect]
    except TypeError as err:
        err = "Invalid `reflect` argument: Error: '{}'".format(err)
        raise ValueError(err)

    if not np.all(goods):
        err = "each row of `reflect` must be `None`, `True`, or have shape (2,)! '{}'".format(reflect)
        raise ValueError(err)

    # Perform additional diagnostics
    for ii in range(ndim):
        # Replace `True` values with the (padded) data extrema
        if reflect[ii] is True:
            reflect[ii] = [np.min(data[ii])*(1 - _NUM_PAD), np.max(data[ii])*(1 + _NUM_PAD)]
        elif (reflect[ii] is not None) and (True in reflect[ii]):
            if reflect[ii][0] is True:
                reflect[ii][0] = np.min(data[ii])*(1 - _NUM_PAD)
            if reflect[ii][1] is True:
                reflect[ii][1] = np.max(data[ii])*(1 + _NUM_PAD)

        if np.all(np.array(reflect[ii]) != None) and (reflect[ii][0] >= reflect[ii][1]):  # noqa
            err = "Reflect is out of order: `reflect`[{}] = {}!".format(ii, reflect[ii])
            raise ValueError(err)

        if helper:
            # Warn if any datapoints are outside of the reflection bounds
            bads = utils.bound_indices(data[ii, :], reflect[ii], outside=True)
            if np.any(bads):
                if weights is None:
                    frac = np.count_nonzero(bads) / bads.size
                else:
                    frac = np.sum(weights[bads]) / np.sum(weights)

                msg = (
                    "A fraction {:.2e} of data[{}] ".format(frac, ii) +
                    "are outside of `reflect` bounds!"
                )
                logging.warning(msg)
                msg = (
                    "`reflect[{}]` = {}; ".format(ii, reflect[ii]) +
                    "`data[{}]` = {}".format(ii, utils.stats_str(data[ii], weights=weights))
                )
                logging.warning(msg)
                logging.warning("I hope you know what you're doing.")

    return reflect
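
# Illustrative sketch (hypothetical values): `reflect` formats that pass the checks in
# `_check_reflect` above, for 2-parameter data.  Each entry is `None` (no reflection),
# `True` (reflect at the padded data extrema), or a 2-element [lo, hi] pair in which
# either bound may itself be `True` (use the padded data extremum) or `None` (one-sided
# reflection).
reflect_none = None                             # no reflection in any dimension
reflect_all = True                              # expands to [True, True] for 2D data
reflect_mixed = [None, [0.0, 1.0]]              # reflect only the second parameter, at 0 and 1
reflect_onesided = [[0.0, None], [True, 1.0]]   # one-sided and data-derived bounds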
def _resample_reflect(self, data, size, reflect, weights=None, keep=None):
    """Resample the given data using reflection.
    """
    matrix = self.matrix
    # Modify covariance-matrix for any `keep` dimensions
    matrix = utils.cov_keep_vars(matrix, keep, reflect=reflect)

    ndim, nvals = np.shape(data)

    # Actually 'reflect' (append new, mirrored points) around the given reflection points
    # Also construct bounding box for valid data
    data, bounds, weights = self._reflect_data(data, reflect, weights=weights)

    # Remove data points outside of kernels (or truncated region)
    data, weights = self._truncate_reflections(data, bounds, weights=weights)

    if (self._chunk is not None) and (self._chunk < size):
        num_chunks = int(np.ceil(size/self._chunk))
        chunk_size = int(np.ceil(size/num_chunks))
    else:
        chunk_size = size
        num_chunks = 1

    # Draw randomly from the given data points, proportionally to their weights
    samps = np.zeros((size, ndim))
    num_good = 0
    cnt = 0
    MAX = 10
    draw = chunk_size
    fracs = []
    while (num_good < size) and (cnt < MAX * num_chunks):
        # Draw candidate resample points
        #    set `keep` to None, `matrix` is already modified to account for it
        trial = self._resample_clear(data, draw, weights=weights, matrix=matrix, keep=None)

        # Find the (boolean) indices of values within target boundaries
        idx = utils.bound_indices(trial, bounds)

        # Store good values to output array
        ngd = np.count_nonzero(idx)
        fracs.append(ngd/idx.size)
        if num_good + ngd <= size:
            samps[num_good:num_good+ngd, :] = trial.T[idx, :]
        else:
            ngd = (size - num_good)
            samps[num_good:num_good+ngd, :] = trial.T[idx, :][:ngd]

        # Increment counters
        num_good += ngd
        cnt += 1
        # Next time, draw more than we need: scale up by 2 per dimension, since the
        # rejection fraction grows with dimensionality; cap at the chunk size
        draw = np.minimum(size - num_good, chunk_size)
        draw = (2**ndim) * draw
        if self._chunk is not None:
            draw = np.minimum(draw, int(self._chunk))

    if num_good < size:
        err = "Failed to draw '{}' samples in {} iterations!".format(size, cnt)
        logging.error("")
        logging.error(err)
        logging.error("fracs = {}\n\t{}".format(utils.stats_str(fracs), fracs))
        logging.error("Obtained {} samples".format(num_good))
        logging.error("Reflect: {}".format(reflect))
        logging.error("Bandwidths: {}".format(np.sqrt(self.matrix.diagonal().squeeze())))
        logging.error("data = ")
        for dd in data:
            logging.error("\t{}".format(utils.stats_str(dd)))
        raise RuntimeError(err)

    samps = samps.T
    return samps
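
# ======================================================================================
# Illustrative sketch (not the library implementation): the rejection loop used by
# `_resample_reflect` above, reduced to one dimension.  Candidates come from an
# arbitrary proposal `draw_func`, and only those inside the `bounds` interval are kept,
# redrawing until `size` samples accumulate.  All names here are hypothetical.
# ======================================================================================
import numpy as np


def rejection_draw(draw_func, bounds, size, max_iter=10):
    """Accumulate `size` samples from `draw_func` that fall inside `bounds`."""
    samps = np.zeros(size)
    num_good = 0
    for _ in range(max_iter):
        trial = draw_func(size)
        # Keep only the candidates within the target boundaries
        good = trial[(trial >= bounds[0]) & (trial <= bounds[1])]
        ngd = min(good.size, size - num_good)
        samps[num_good:num_good+ngd] = good[:ngd]
        num_good += ngd
        if num_good >= size:
            return samps

    raise RuntimeError("Failed to draw {} samples in {} iterations!".format(size, max_iter))


# e.g. a unit normal proposal truncated to non-negative values:
samps = rejection_draw(lambda n: np.random.normal(0.0, 1.0, n), (0.0, np.inf), 100)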