def test_midpoints_log(self):
    """Check `utils.midpoints` with 'log' scaling.

    First compares against hand-computed values along each axis of a small
    2D input, then compares against values derived from random data along
    both axes of a (4, 5) array.
    """
    print("\n|Test_Utils:test_midpoints_log()|")
    test = [[1e0, 1e1, 1e2, 1e3],
            [1e2, 1e3, 1e4, 1e5]]
    aa = np.sqrt(10.0)
    # Expected results for `axis=0` (first entry) and `axis=1` (second entry)
    truth = [[1e1, 1e2, 1e3, 1e4],
             [[aa * 1e0, aa * 1e1, aa * 1e2],
              [aa * 1e2, aa * 1e3, aa * 1e4]]]
    for ii, tr in enumerate(truth):
        vals = utils.midpoints(test, 'log', axis=ii)
        assert_true(np.all(np.shape(tr) == np.shape(vals)))
        assert_true(np.allclose(tr, vals))

    shp = (4, 5)
    # `np.product` was removed in NumPy 2.0; `np.prod` is the supported name
    test_log = np.random.uniform(-2.0, 2.0, np.prod(shp)).reshape(shp)
    test_lin = 10**test_log
    for ii in range(2):
        # Make sure `midpoints` gives consistent results itself
        vals_log = utils.midpoints(test_log, 'lin', axis=ii)
        vals_lin = utils.midpoints(test_lin, 'log', axis=ii)
        assert_true(np.all(np.shape(vals_log) == np.shape(vals_lin)))
        assert_true(np.allclose(10**vals_log, vals_lin))

        # Compare log-midpoint to known values
        temp = np.moveaxis(test_lin, ii, 0)
        temp = np.log10(temp)
        true = temp[:-1, :] + 0.5 * np.diff(temp, axis=0)
        true = np.moveaxis(true, 0, ii)
        true = 10**true
        assert_true(np.all(np.shape(true) == np.shape(vals_lin)))
        assert_true(np.allclose(true, vals_lin))
def test_reflect_2d(self):
    """Check that resampling with reflection keeps samples inside the bounds.

    For every reflection specification, samples drawn with `reflect` must
    all lie strictly inside the given bounds in each reflected dimension,
    while samples drawn without reflection must include some values outside
    those bounds.
    """
    print("\n|Test_KDE_Resample:test_reflect_2d()|")
    # Fixed seed (the previous random draw of a seed was immediately
    # overwritten, so it has been dropped)
    seed = 8067
    print(seed)
    np.random.seed(seed)
    NUM = 2000
    xx = np.random.uniform(0.0, 2.0, NUM)
    yy = np.random.normal(1.0, 1.5, NUM)
    # Truncate `yy` at 2.0, then top back up to `NUM` points by re-drawing
    # from the surviving values
    yy = yy[yy < 2.0]
    yy = np.concatenate([yy, np.random.choice(yy, NUM - yy.size)])
    data = [xx, yy]

    kde = kale.KDE(data)
    reflections = [
        [[0.0, 2.0], [None, 2.0]],
        [[0.0, 2.0], None],
        [None, [None, 2.0]],
        None
    ]
    for jj, reflect in enumerate(reflections):
        # Draw order is kept (reflected first) so the random stream is unchanged
        samps_ref = kde.resample(reflect=reflect)
        samps_nrm = kde.resample()
        if reflect is None:
            continue

        for ii, ref in enumerate(reflect):
            if ref is None:
                continue
            # A missing bound means "unbounded on that side"; use local values
            # instead of writing back into the `reflections` specification
            lo = -np.inf if ref[0] is None else ref[0]
            hi = np.inf if ref[1] is None else ref[1]
            print(jj, ii, [lo, hi])
            # kk == 0: no reflection; kk == 1: with reflection
            for kk, zz in enumerate([samps_nrm[ii], samps_ref[ii]]):
                inside = (lo < zz) & (zz < hi)
                outside = ((zz < lo) | (hi < zz))
                print("\tin : ", kk, np.all(inside), np.any(inside))
                print("\tout: ", kk, np.all(outside), np.any(outside))
                if kk == 0:
                    assert_false(np.all(inside))
                    assert_true(np.any(outside))
                else:
                    assert_true(np.all(inside))
                    assert_false(np.any(outside))
def test_different_bws(self):
    """Compare resampling from a 2D KDE against per-parameter 1D KDEs.

    A 2D KDE is built with a separate bandwidth per parameter, and a 1D KDE
    is built for each parameter with the matching bandwidth.  Samples of
    each parameter from both are compared with a two-sample KS test.
    """
    print("\n|Test_KDE_Resample:test_different_bws()|")
    np.random.seed(9235)
    NUM = 1000
    half = NUM // 2
    gauss = np.random.normal(6.0, 1.0, half)
    lognorm = np.random.lognormal(0, 0.5, size=half)
    aa = np.concatenate([gauss, lognorm])
    bb = np.random.normal(3.0, 0.02, NUM) + aa / 100
    data = [aa, bb]

    edges = [utils.spacing(vals, 'lin', 100, stretch=1.0) for vals in data]
    cents = [utils.midpoints(edge, 'lin') for edge in edges]
    xe, ye = np.meshgrid(*edges, indexing='ij')
    xc, yc = np.meshgrid(*cents, indexing='ij')

    bws = [0.5, 2.0]
    kde2d = kale.KDE(data, bandwidth=bws)
    kde1d = [kale.KDE(vals, bandwidth=bw) for vals, bw in zip(data, bws)]

    for ii, single in enumerate(kde1d):
        # 1D draw first, then the 2D draw, to keep the random stream order
        samp_1d = single.resample(NUM).squeeze()
        samp_2d = kde2d.resample(NUM)[ii]
        # The two distributions should resemble each other
        ks, pv = sp.stats.ks_2samp(samp_1d, samp_2d)
        # Threshold calibrated to the seed-value `9235` set above
        print("{}, pv = {}".format(ii, pv))
        assert_true(pv > 0.05)

    return
def pdf_params_fixed_bandwidth(self, kernel):
    """Compare per-parameter PDFs of a 2D KDE with 1D KDEs at a fixed bandwidth.

    Data are two concatenated 2D normal draws.  For each parameter, the
    density from the 2D KDE restricted via `params` must match the density
    of a 1D KDE of that parameter alone, and both must sum to about one
    over the evaluation grid.

    Arguments
    ---------
    kernel : kernel specification forwarded to `kale.KDE`

    """
    print("\n|Test_KDE_PDF:pdf_params_fixed_bandwidth()|")
    np.random.seed(124)
    NUM = 1000
    bandwidth = 0.02

    def _covariance(sigma, corr):
        # 2x2 covariance matrix from two standard deviations and a correlation
        s2 = np.square(sigma)
        cc = corr * sigma[0] * sigma[1]
        return np.array([[s2[0], cc], [cc, s2[1]]])

    # Draw order matters for reproducibility: correlated component first
    data = np.random.multivariate_normal(
        [1.0, 2.0], _covariance([2.5, 1.5], 0.9), NUM).T
    more = np.random.multivariate_normal(
        [1.0, 6.0], _covariance([2.5, 0.5], 0.0), NUM).T
    data = np.concatenate([data, more], axis=-1)

    kde = kale.KDE(data, bandwidth=bandwidth, kernel=kernel)

    edges = [utils.spacing(row, 'lin', 200, stretch=0.1) for row in data]
    cents = [utils.midpoints(edge, 'lin') for edge in edges]
    widths = [np.diff(edge) for edge in edges]
    xe, ye = np.meshgrid(*edges, indexing='ij')
    xc, yc = np.meshgrid(*cents, indexing='ij')
    hist, *_ = np.histogram2d(*data, bins=edges, density=True)

    for par, (xx, wid) in enumerate(zip(cents, widths)):
        pdf_2d = kde.density(xx, params=par, probability=True)[1]
        kde_1d = kale.KDE(data[par, :], bandwidth=bandwidth, kernel=kernel)
        pdf_1d = kde_1d.density(xx, probability=True)[1]
        print(f"pdf_1d = {utils.stats_str(pdf_1d)}")
        print(f"pdf_2d = {utils.stats_str(pdf_2d)}")
        assert_true(np.allclose(pdf_2d, pdf_1d, rtol=1e-3))

        # Each PDF should sum to roughly unity over the grid
        for pdf in (pdf_2d, pdf_1d):
            tot = np.sum(pdf * wid)
            print("tot = {:.4e}".format(tot))
            assert_true(np.isclose(tot, 1.0, rtol=2e-2))

    return
def compare_scipy_2d(self, kernel):
    """Compare 2D densities from `kale.KDE` with `scipy.stats.gaussian_kde`.

    Both estimators are built on the same data for several bandwidth
    choices and evaluated on the same grid; the results must agree.

    Arguments
    ---------
    kernel : kernel specification forwarded to `kale.KDE`

    """
    print("\n|Test_KDE_PDF:test_compare_scipy_2d()|")

    NUM = 1000
    a1 = np.random.normal(6.0, 1.0, NUM//2)
    a2 = np.random.lognormal(0, 0.5, size=NUM//2)
    aa = np.concatenate([a1, a2])
    bb = np.random.normal(3.0, 0.02, NUM) + aa/100

    data = [aa, bb]
    edges = [utils.spacing(dd, 'lin', 30, stretch=0.5) for dd in data]
    cents = [utils.midpoints(ee, 'lin') for ee in edges]

    xc, yc = np.meshgrid(*cents, indexing='ij')
    grid = np.vstack([xc.ravel(), yc.ravel()])

    methods = ['scott', 0.04, 0.2, 0.8]
    classes = [
        lambda xx, bw: sp.stats.gaussian_kde(xx, bw_method=bw),
        lambda xx, bw: kale.KDE(xx, bandwidth=bw, kernel=kernel),
    ]
    for mm in methods:
        kdes_list = []
        for cc in classes:
            kde = cc(data, mm)
            # Choose the evaluation method by inspecting the object rather
            # than catching `AttributeError` around the whole evaluation,
            # which would hide a genuine `AttributeError` raised inside
            # `density` itself.
            if hasattr(kde, 'density'):
                test = kde.density(grid, probability=True)[1]
            else:
                test = kde.pdf(grid)
            kdes_list.append(test.reshape(xc.shape).T)

        assert_true(np.allclose(kdes_list[0], kdes_list[1]))
def reflect_2d(self, kernel):
    """Check 2D KDE densities evaluated with reflecting boundaries.

    For each reflection specification the density is evaluated on a grid
    and must be positive inside the bounds (at every inside point when
    `kernel._FINITE == 'infinite'`, at some inside point otherwise), be
    zero on the `outside` mask, and sum to about one over the grid.

    Arguments
    ---------
    kernel : kernel specification forwarded to `kale.KDE`; must expose `_FINITE`

    """
    print("\n|Test_KDE_PDF:test_reflect_2d()|")
    np.random.seed(124)
    NUM = 1000
    xx = np.random.uniform(0.0, 2.0, NUM)
    yy = np.random.normal(1.0, 1.0, NUM)
    # Truncate at 2.0 and top back up to `NUM` from the surviving values
    yy = yy[yy < 2.0]
    yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)])
    data = [xx, yy]

    edges = [utils.spacing(vals, 'lin', 30) for vals in data]
    egrid = [utils.spacing(edge, 'lin', 100, stretch=0.5) for edge in edges]
    cgrid = [utils.midpoints(edge, 'lin') for edge in egrid]
    width = [np.diff(edge) for edge in egrid]
    # Area of every grid cell, used to sum the density over the grid
    area = width[0][:, np.newaxis] * width[1][np.newaxis, :]

    xc, yc = np.meshgrid(*cgrid, indexing='ij')
    grid = np.vstack([xc.ravel(), yc.ravel()])
    hist, *_ = np.histogram2d(*data, bins=egrid, density=True)

    kde = kale.KDE(data, kernel=kernel)
    if kernel._FINITE == 'infinite':
        inside_test_func = np.all
    else:
        inside_test_func = np.any

    reflections = [
        [[0.0, 2.0], [None, 2.0]],
        [[0.0, 2.0], None],
        [None, [None, 2.0]],
        None
    ]
    for jj, reflect in enumerate(reflections):
        pdf_flat = kde.density(grid, reflect=reflect, probability=True)[1]
        pdf_grid = pdf_flat.reshape(hist.shape)

        inside = np.ones_like(pdf_flat, dtype=bool)
        # Without reflection no grid point counts as outside
        outside = np.zeros_like(pdf_flat, dtype=bool)
        if reflect is not None:
            outside = np.ones_like(pdf_flat, dtype=bool)
            for coord, ref in zip(grid, reflect):
                if ref is None:
                    ref = [-np.inf, np.inf]
                # Bounds are filled in place on purpose: the `reflect` printed
                # below then shows the infinities rather than `None`
                if ref[0] is None:
                    ref[0] = -np.inf
                if ref[1] is None:
                    ref[1] = np.inf
                lo, hi = ref
                inside &= (lo < coord) & (coord < hi)
                # NOTE(review): `outside` is AND-ed across dimensions, so an
                # unbounded dimension leaves the mask empty -- confirm intended
                outside &= (coord < lo) | (hi < coord)

        assert_true(inside_test_func(pdf_flat[inside] > 0.0))
        assert_true(np.allclose(pdf_flat[outside], 0.0))

        prob_tot = np.sum(pdf_grid * area)
        print(jj, reflect, "prob_tot = {:.4e}".format(prob_tot))
        assert_true(np.isclose(prob_tot, 1.0, rtol=3e-2))

    return
def test_midpoints_lin(self):
    """Check `utils.midpoints` with 'lin' scaling.

    First compares against hand-computed values along each axis of a small
    2D input, then against midpoints derived with `np.diff` from random
    data along both axes of a (4, 5) array.
    """
    print("\n|Test_Utils:test_midpoints_lin()|")
    test = [[0, 1, 2, 3],
            [2, 3, 4, 5]]
    # Expected results for `axis=0` (first entry) and `axis=1` (second entry)
    truth = [[1, 2, 3, 4],
             [[0.5, 1.5, 2.5],
              [2.5, 3.5, 4.5]]]
    for ii, tr in enumerate(truth):
        vals = utils.midpoints(test, 'lin', axis=ii)
        assert_true(np.all(np.shape(tr) == np.shape(vals)))
        assert_true(np.all(tr == vals))

    shp = (4, 5)
    # `np.product` was removed in NumPy 2.0; `np.prod` is the supported name
    test = np.random.uniform(-1.0, 1.0, np.prod(shp)).reshape(shp)
    for ii in range(2):
        vals = utils.midpoints(test, 'lin', axis=ii)
        # Reference midpoints: move the target axis first, step half a difference
        temp = np.moveaxis(test, ii, 0)
        true = temp[:-1, :] + 0.5 * np.diff(temp, axis=0)
        true = np.moveaxis(true, 0, ii)
        assert_true(np.all(np.shape(true) == np.shape(vals)))
        assert_true(np.allclose(true, vals))
def __init__(self, edges, dens, threshold=10.0, **kwargs):
    """Split the gridded mass into stochastically and manually sampled bins.

    Arguments
    ---------
    edges : forwarded to the parent initializer
    dens : forwarded to the parent initializer
    threshold : scalar
        Bins whose mass exceeds this value are zeroed in the cumulative
        distribution and stored separately in `_mass_ins`.
    **kwargs : forwarded to the parent initializer

    """
    super().__init__(edges, dens, **kwargs)

    # Note: `dens` has already been converted from density to mass (i.e. integrating each
    # cell); this happened in `Sample_Grid.__init__()` ==> `Sample_Outliers._init_data()`.
    # `data_edge` is still a density (at the corners of each cell).
    above = (self._mass > threshold)

    # Only bins at or below the threshold are sampled stochastically: recalculate the
    # cumulative sum with the above-threshold bins zeroed out
    mass_outs = np.copy(self._mass)
    mass_outs[above] = 0.0
    idx, csum = _data_to_cumulative(mass_outs, prefilter=False)
    self._idx = idx
    self._csum = csum

    # Bins above the threshold are kept separately for later use
    mass_ins = np.copy(self._mass)
    mass_ins[~above] = 0.0

    # Center-of-mass of each cell, based on the density values at the cell corners
    grid = self.grid
    dens_edge = self._dens
    dens_cent = utils.midpoints(dens_edge, log=False, axis=None)
    coms = [
        utils.midpoints(dens_edge * coord, log=False, axis=None) / dens_cent
        for coord in grid
    ]

    self._threshold = threshold
    self._mass_ins = mass_ins
    self._coms_ins = coms
    self._mass_outs = mass_outs
    return
def test_midpoints_axes(self):
    """Check output shapes and values of `utils.midpoints` along each axis.

    On an array of ones, midpoints along any axis (linear or log) must
    shorten that axis by one and remain all ones.  A 1D range is then
    checked against the average of neighboring elements.
    """
    print("\n|Test_Utils:test_midpoints_axes()|")
    shp = (12, 14, 16)
    test = np.ones(shp)
    for ii in range(test.ndim):
        # The axis being reduced loses exactly one element
        expected_shape = np.array(shp)
        expected_shape[ii] -= 1
        for scale in ('lin', 'log'):
            vals = utils.midpoints(test, scale, axis=ii)
            assert_true(np.all(vals.shape == expected_shape))
            assert_true(np.all(vals == 1.0))

    test = np.arange(10)
    vals = utils.midpoints(test, 'lin')
    true = 0.5 * (test[:-1] + test[1:])
    assert_true(np.allclose(vals, true))
    return
def reflect_1d(self, kernel):
    """Check 1D KDE densities with and without reflecting boundaries.

    Uniform data on `EXTR` are evaluated on a grid.  In every case the
    density must sum to about one.  Without reflection the density must be
    nonzero beyond the data extrema and vary strongly; with reflection at
    both extrema it must vanish beyond them and stay nearly flat.

    Arguments
    ---------
    kernel : kernel specification forwarded to `kale.KDE`; must expose `_FINITE`

    """
    print("\n|Test_KDE_PDF:reflect_1d()|")
    np.random.seed(124)
    NUM = 1000
    EXTR = [0.0, 2.0]
    aa = np.random.uniform(*EXTR, NUM)
    egrid = utils.spacing(aa, 'lin', 2000, stretch=0.5)
    cgrid = utils.midpoints(egrid, 'lin')
    delta = np.diff(egrid)

    # Grid points beyond the lower / upper data extremum
    below = cgrid < EXTR[0]
    above = cgrid > EXTR[1]

    # If the kernel's support is infinite, then all points outside of boundaries should be
    # nonzero; if it's finite-supported, then only some of them (near edges) will be
    if kernel._FINITE == 'infinite':
        outside_test_func = np.all
    else:
        outside_test_func = np.any

    for bnd in [None, EXTR]:
        kde = kale.KDE(aa, kernel=kernel)
        pdf = kde.density(cgrid, reflect=bnd, probability=True)[1]

        # Make sure unitarity is preserved
        tot = np.sum(pdf * delta)
        print("Boundary '{}', total = {:.4e}".format(bnd, tot))
        assert_true(np.isclose(tot, 1.0, rtol=1e-3))

        ratio_extr = np.max(pdf) / np.min(pdf[pdf > 0])
        pdf_below = pdf[below]
        pdf_above = pdf[above]

        if bnd is None:
            # No reflection: non-zero PDF everywhere, and large ratio of extrema
            assert_true(outside_test_func(pdf_below > 0.0))
            assert_true(outside_test_func(pdf_above > 0.0))
            assert_true(ratio_extr > 10.0)
        elif bnd[0] is None:
            # No lower-reflection: nonzero values below 0.0
            assert_true(outside_test_func(pdf_below > 0.0))
            assert_true(np.all(pdf_above == 0.0))
        elif bnd[1] is None:
            # No upper-reflection: nonzero values above 2.0
            assert_true(np.all(pdf_below == 0.0))
            assert_true(outside_test_func(pdf_above > 0.0))
        else:
            # Reflection on both sides: nothing outside, small ratio of extrema
            assert_true(np.all(pdf_below == 0.0))
            assert_true(np.all(pdf_above == 0.0))
            assert_true(ratio_extr < 2.0)

    return
def _grad_along(data_edge, dim):
    """Difference `data_edge` along `dim`, then take midpoints over all other axes.

    Arguments
    ---------
    data_edge : array_like
        Values to difference (passed to `np.diff`).
    dim : int
        Axis along which the difference is taken.

    Returns
    -------
    Result of `utils.midpoints` applied to the differences over every axis
    except `dim`.

    """
    diffs = np.diff(data_edge, axis=dim)
    # Every axis other than the differenced one
    other_axes = list(np.arange(diffs.ndim))
    del other_axes[dim]
    return utils.midpoints(diffs, log=False, axis=other_axes)
def sample(self, nsamp=None, interpolate=True, return_scalar=None):
    """Sample from the probability distribution.

    Bins are chosen by `self._random_bins`, and each sample is then placed
    within its bin, either uniformly or following the density gradient
    across the bin.

    Arguments
    ---------
    nsamp : scalar or None
        Number of samples to draw (converted with `int`).  If `None`, the
        sum of `self._mass` is used.
    interpolate : bool
        If True, place samples within bins using the gradient of `dens`
        across each bin; otherwise place them uniformly.  Forced to False
        (with an info log message) when `self._dens` is None.
    return_scalar : bool or None
        Whether to also return scalar values for each sample.  If `None`,
        scalars are returned exactly when `self._scalar_dens` is not None.
        If True while `self._scalar_dens` is None, a warning is logged and
        no scalars are returned.

    Returns
    -------
    vals : (D, N) ndarray of scalar
        Sample locations; same shape as the intrabin locations returned by
        `self._random_bins`.
    scalar_values : ndarray
        Only returned (as the second element of a tuple) when scalars are
        being sampled.

    """
    dens = self._dens
    scalar_dens = self._scalar_dens
    edges = self._edges
    ndim = self._ndim

    # ---- initialize parameters
    if interpolate and (dens is None):
        logging.info("`dens` is None, cannot interpolate sampling")
        interpolate = False

    # If no number of samples are given, assume that the units of `self._mass` are number of samples, and choose
    # the total number of samples to be the total of this
    if nsamp is None:
        nsamp = self._mass.sum()
    nsamp = int(nsamp)

    if return_scalar is None:
        return_scalar = (scalar_dens is not None)
    elif return_scalar and (scalar_dens is None):
        return_scalar = False
        logging.warning(
            "WARNING: no `scalar` initialized, but `return_scalar`=True!")

    # ---- Get generalized sampling locations

    # Choose random bins, proportionally to `mass`, and positions within bins (uniformly distributed)
    #     `bin_numbers_flat` (N*D,) are the index numbers for bins in flattened 1D array of length N*D
    #     `intrabin_locs` (D, N) are position [0.0, 1.0] within each bin for each sample in each dimension
    # NOTE(review): the shape of `bin_numbers_flat` is given as (N*D,) above but (N,) below -- confirm
    bin_numbers_flat, intrabin_locs = self._random_bins(nsamp)
    # Convert from flat (N,) indices into ND indices; (D, N) for D dimensions, N samples (`nsamp`)
    bin_numbers = np.unravel_index(bin_numbers_flat, self._shape_bins)

    # If scalars are also being sampled: find scalar value for bin centers (i.e. bin averages)
    #    this will be updated/improved if `interpolation=True`
    if return_scalar:
        scalar_mass = self._scalar_mass
        scalar_values = scalar_mass[bin_numbers]

    # ---- Place samples in each dimension

    vals = np.zeros_like(intrabin_locs)
    for dim, (edge, bidx) in enumerate(zip(edges, bin_numbers)):
        # Width of bins in this dimension
        wid = np.diff(edge)

        # Random location, in this dimension, for each bin. Relative position, i.e.
        # between [0.0, 1.0]
        loc = intrabin_locs[dim]

        # Uniform / no-interpolation :: random-uniform within each bin
        if (not interpolate):
            vals[dim, :] = edge[bidx] + wid[bidx] * loc

        # Interpolated :: random-linear proportional to bin gradients (i.e. slope across bin in each dimension)
        else:
            # Calculate normalization for gradients; needs to be done for each dimension specifically
            # This normalization is needed to ensure that the pdf values are unitary when integrating in each dim
            norm = utils.trapz_dens_to_mass(dens, edges, axis=dim)
            # Average the normalization over every axis except the current one
            others = np.arange(ndim).tolist()
            others.pop(dim)
            norm = utils.midpoints(norm, axis=others)

            edge = np.asarray(edge)

            # Find the gradient along this dimension (using center-values in other dimensions)
            grad = _grad_along(dens, dim) / norm
            # get the gradient for each sample
            grad = grad.flat[bin_numbers_flat] * wid[bidx]
            # interpolate edge values in this dimension (returns values [0.0, 1.0])
            temp = _intrabin_linear_interp(loc, grad)
            # convert from intrabin positions to overall positions by linearly rescaling
            vals[dim, :] = edge[bidx] + temp * wid[bidx]

        # interpolate scalar values also
        if return_scalar and interpolate:
            grad = _grad_along(scalar_dens, dim)
            grad = grad.flat[bin_numbers_flat]
            # shift `loc` (location within bin) to center point
            scalar_values += grad * (loc - 0.5)

    if return_scalar:
        return vals, scalar_values

    return vals