def _init_data(self): """Override `Sample_Grid._init_data()` to avoid calculating `idx` and `csum`, not needed yet """ if self._mass is None: self._mass = utils.trapz_dens_to_mass(self._dens, self._edges, axis=None) if (self._scalar_mass is None) and (self._scalar_dens is not None): self._scalar_mass = utils.trapz_dens_to_mass(self._scalar_dens, self._edges, axis=None) return
def _init_data(self):
    if self._mass is None:
        self._mass = utils.trapz_dens_to_mass(self._dens, self._edges, axis=None)
    if (self._scalar_mass is None) and (self._scalar_dens is not None):
        self._scalar_mass = utils.trapz_dens_to_mass(self._scalar_dens, self._edges, axis=None)
    # Convert bin masses into sorted indices and a normalized cumulative distribution,
    # used to choose bins proportionally to their mass when sampling
    idx, csum = _data_to_cumulative(self._mass)
    self._idx = idx
    self._csum = csum
    return
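# For reference, a plausible sketch of the helper `_data_to_cumulative` whose outputs
# (`idx`, `csum`) are stored above. This is an assumption, inferred from how sorted flat
# indices and a normalized cumulative mass would be used to invert the CDF when selecting
# bins; the real helper may differ in detail.
def _data_to_cumulative_sketch(mass):
    import numpy as np
    # Flatten the ND grid of bin masses into a 1D array
    mass = np.asarray(mass).flatten()
    # Sort the bins and construct the cumulative distribution of masses
    idx = np.argsort(mass)
    csum = np.cumsum(mass[idx])
    # Normalize to [0.0, 1.0], prepending zero so the CDF can be inverted by interpolation
    csum = np.concatenate([[0.0], csum / csum[-1]])
    return idx, csum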
def _test_sample(self, kernel):
    kern = kernel()
    NUM = int(1e6)
    bw = 1.0
    pad = 4.0
    xe, xc, dx = kale.utils.bins(-pad * bw, pad * bw, 100)

    # Draw samples and histogram them into a normalized density
    samp = kern.sample(NUM)
    hist, _ = np.histogram(samp, xe, density=True)

    # Construct the 'analytic' PDF and CDF at the bin centers
    pdf = kern.evaluate(xc)
    cum_pdf = utils.trapz_dens_to_mass(pdf, xc)
    cum_pdf = np.cumsum(cum_pdf)
    cum_pdf = np.append([0.0], cum_pdf)
    cdf = kern.cdf(xc)

    # Compare the 'analytic' PDF/CDF with the distribution of samples
    # CDFs tend not to match as well, so use a larger tolerance there
    for aa, bb, name, tol in zip([hist, cum_pdf], [pdf, cdf], ['pdf', 'cdf'], [1e-2, 1e-1]):
        idx = (aa > 0.0) & (bb > 0.0)
        dof = np.count_nonzero(idx) - 1
        # Reduced chi-square-like statistic using squared fractional errors
        x2 = np.sum(np.square(aa[idx] - bb[idx]) / bb[idx]**2)
        x2 = x2 / dof
        print("Distribution: {} :: {} : x2/dof = {:.4e}".format(kern.name(), name, x2))
        print("\t" + kale.utils.array_str(aa[idx]))
        print("\t" + kale.utils.array_str(bb[idx]))
        utils.alltrue(x2 < tol)

    return
def _test_ndim_a2(self, ndim):
    from kalepy import utils
    BIN_SIZE_RANGE = [10, 30]

    num_bins = np.random.randint(*BIN_SIZE_RANGE, ndim)
    edges = []
    for nb in num_bins:
        ee = np.cumsum(np.random.uniform(0.0, 2.0, nb))
        edges.append(ee)

    grid = np.meshgrid(*edges, indexing='ij')
    shp = np.array([len(ee) for ee in edges])

    for axis in np.ndindex(*([ndim] * 2)):
        # Skip axis-pairs with repeated dimensions (e.g. (0, 0))
        if len(np.unique(axis)) != len(axis):
            continue

        axis = np.asarray(axis)
        not_axis = np.array(list(set(range(ndim)) - set(axis)))
        print("\nndim = {}, axis = {}, other = {}".format(ndim, axis, not_axis))

        # Broadcast a random normalization over the non-integrated dimensions
        bcast_norm = [np.newaxis for ii in range(ndim)]
        for na in not_axis:
            bcast_norm[na] = slice(None)
        bcast_norm = tuple(bcast_norm)
        norm = np.random.uniform(0.0, 10.0, shp[not_axis])[bcast_norm]

        widths = []
        for ii in range(ndim):
            dim_len_inn = shp[ii]
            if ii in axis:
                wid = np.diff(edges[ii])
            else:
                wid = np.ones(dim_len_inn)

            # Create new axes along all but the current dimension, slice along the current dimension
            cut = [np.newaxis for jj in range(ndim)]
            cut[ii] = slice(None)
            temp = wid[tuple(cut)]
            widths.append(temp)

        wids = np.prod(np.array(widths, dtype=object), axis=0).astype(float)

        pdf = np.ones_like(grid[0]) * norm
        pmf = utils.trapz_dens_to_mass(pdf, edges, axis=axis)

        # Each integrated dimension loses one bin edge
        new_shp = [ss for ss in shp]
        for aa in axis:
            new_shp[aa] -= 1

        utils.alltrue(np.shape(pmf) == np.array(new_shp), "Output shape is {fail:}correct")
        utils.alltrue(pmf == norm * wids, 'Values do {fail:}match')

    return
def _test_ndim_a1(self, ndim):
    from kalepy import utils
    BIN_SIZE_RANGE = [10, 30]

    num_bins = np.random.randint(*BIN_SIZE_RANGE, ndim)
    edges = []
    for nb in num_bins:
        ee = np.cumsum(np.random.uniform(0.0, 2.0, nb))
        edges.append(ee)

    grid = np.meshgrid(*edges, indexing='ij')
    shp = [len(ee) for ee in edges]

    for axis in range(ndim):
        not_axis = (axis + 1) % ndim
        print("\nndim = {}, axis = {}, other = {}".format(ndim, axis, not_axis))

        # Broadcast a random normalization over the non-integrated dimension
        bcast_norm = [np.newaxis for ii in range(ndim)]
        bcast_norm[not_axis] = slice(None)
        bcast_norm = tuple(bcast_norm)
        norm = np.random.uniform(0.0, 10.0, shp[not_axis])[bcast_norm]

        # Broadcast the bin widths along the integrated dimension
        bcast_wids = [np.newaxis for ii in range(ndim)]
        bcast_wids[axis] = slice(None)
        bcast_wids = tuple(bcast_wids)
        wids = np.diff(edges[axis])[bcast_wids]

        pdf = np.ones_like(grid[0]) * norm
        pmf = utils.trapz_dens_to_mass(pdf, edges, axis=axis)

        # The integrated dimension loses one bin edge
        new_shp = [ss for ss in shp]
        new_shp[axis] -= 1

        utils.alltrue(np.shape(pmf) == np.array(new_shp), "Output shape is {fail:}correct")
        utils.alltrue(pmf == norm * wids, 'Values do {fail:}match')

    return
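# A minimal concrete example (not part of the test suite) of the `trapz_dens_to_mass`
# semantics exercised by the two tests above: when the density is constant across a bin,
# the trapezoidal mass of that bin is simply the density times the bin width.
import numpy as np
from kalepy import utils

edges = np.array([0.0, 1.0, 3.0, 6.0])
dens = 2.0 * np.ones_like(edges)
pmf = utils.trapz_dens_to_mass(dens, edges)
print(pmf)    # expect [2.0, 4.0, 6.0]: density 2.0 times widths [1.0, 2.0, 3.0]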
def _test_ndim(self, ndim):
    from kalepy import utils
    print("`ndim` = {}".format(ndim))
    BIN_SIZE_RANGE = [10, 30]

    extr = [[0.0, np.random.uniform(0.0, 2.0)] for ii in range(ndim)]
    norm = np.random.uniform(0.0, 10.0)

    edges = [np.linspace(*ex, np.random.randint(*BIN_SIZE_RANGE)) for ex in extr]
    grid = np.meshgrid(*edges, indexing='ij')

    # Construct a 'pyramid' PDF: norm times the minimum of the rescaled coordinates
    lengths = np.max(extr, axis=-1)
    xx = np.min(np.moveaxis(grid, 0, -1) / lengths, axis=-1)
    pdf = norm * xx
    area = np.prod(lengths)

    pmf = utils.trapz_dens_to_mass(pdf, edges)

    # Known volume of a 'pyramid' in ndim: base area times height, over (ndim + 1)
    vol = area * norm / (ndim + 1)
    tot = np.sum(pmf)
    print("Volume = {:.4e}, Total Mass = {:.4e}; ratio = {:.4e}".format(vol, tot, tot / vol))
    utils.allclose(vol, tot, rtol=1e-2, msg="total volume does {fail:}match analytic value")

    test = utils.trapz_nd(pdf, edges)
    print("Volume = {:.4e}, Total Mass = {:.4e}; ratio = {:.4e}".format(test, tot, tot / test))
    utils.allclose(test, tot, rtol=1e-2, msg="total volume does {fail:}match `trapz_nd` value")

    return
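# A quick numeric sanity-check (illustrative only) of the pyramid-volume factor used
# above: the expected value of the minimum of `ndim` iid Uniform(0, 1) variables is
# 1 / (ndim + 1), so integrating norm * min_i(x_i / L_i) over the box gives
# norm * area / (ndim + 1).
import numpy as np

ndim = 3
uu = np.random.uniform(0.0, 1.0, size=(1_000_000, ndim))
print(uu.min(axis=1).mean(), 1.0 / (ndim + 1))    # both ~0.25 for ndim = 3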
def sample(self, nsamp=None, interpolate=True, return_scalar=None):
    """Sample from the probability distribution.

    Arguments
    ---------
    nsamp : scalar or None
        Number of samples to draw; if None, use the total mass of the grid.
    interpolate : bool
        If True, interpolate linearly within each bin based on the local density gradient.
    return_scalar : bool
        If True, also return the scalar field evaluated at each sample.

    Returns
    -------
    vals : (D, N) ndarray of scalar
        Sampled values in each of the D dimensions, for each of the N samples.

    """
    dens = self._dens
    scalar_dens = self._scalar_dens
    edges = self._edges
    ndim = self._ndim

    # ---- initialize parameters
    if interpolate and (dens is None):
        logging.info("`dens` is None, cannot interpolate sampling")
        interpolate = False

    # If no number of samples is given, assume that the units of `self._mass` are numbers
    # of samples, and choose the total number of samples to be its total
    if nsamp is None:
        nsamp = self._mass.sum()
    nsamp = int(nsamp)

    if return_scalar is None:
        return_scalar = (scalar_dens is not None)
    elif return_scalar and (scalar_dens is None):
        return_scalar = False
        logging.warning("WARNING: no `scalar` initialized, but `return_scalar`=True!")

    # ---- Get generalized sampling locations

    # Choose random bins, proportionally to `mass`, and positions within bins (uniformly distributed)
    # `bin_numbers_flat` (N,) are flat indices into the flattened 1D grid of bins
    # `intrabin_locs` (D, N) are positions in [0.0, 1.0] within each bin, for each sample, in each dimension
    bin_numbers_flat, intrabin_locs = self._random_bins(nsamp)
    # Convert from flat (N,) indices into ND indices; (D, N) for D dimensions, N samples (`nsamp`)
    bin_numbers = np.unravel_index(bin_numbers_flat, self._shape_bins)

    # If scalars are also being sampled: find the scalar value for bin centers (i.e. bin averages)
    # this will be updated/improved if `interpolate=True`
    if return_scalar:
        scalar_mass = self._scalar_mass
        scalar_values = scalar_mass[bin_numbers]

    # ---- Place samples in each dimension

    vals = np.zeros_like(intrabin_locs)
    for dim, (edge, bidx) in enumerate(zip(edges, bin_numbers)):
        # Width of bins in this dimension
        wid = np.diff(edge)

        # Random location, in this dimension, for each bin; relative position, i.e. in [0.0, 1.0]
        loc = intrabin_locs[dim]

        # Uniform / no-interpolation :: random-uniform within each bin
        if not interpolate:
            vals[dim, :] = edge[bidx] + wid[bidx] * loc

        # Interpolated :: random-linear, proportional to bin gradients (i.e. slope across each bin in each dimension)
        else:
            # Calculate the normalization for the gradients; this must be done separately for each
            # dimension, to ensure that the PDF values are unitary when integrating along that dimension
            norm = utils.trapz_dens_to_mass(dens, edges, axis=dim)
            others = np.arange(ndim).tolist()
            others.pop(dim)
            norm = utils.midpoints(norm, axis=others)

            edge = np.asarray(edge)

            # Find the gradient along this dimension (using center-values in the other dimensions)
            grad = _grad_along(dens, dim) / norm
            # Get the gradient for each sample
            grad = grad.flat[bin_numbers_flat] * wid[bidx]
            # Interpolate the intrabin location in this dimension (returns values in [0.0, 1.0])
            temp = _intrabin_linear_interp(loc, grad)
            # Convert from intrabin positions to overall positions by linearly rescaling
            vals[dim, :] = edge[bidx] + temp * wid[bidx]

        # Also interpolate the scalar values
        if return_scalar and interpolate:
            grad = _grad_along(scalar_dens, dim)
            grad = grad.flat[bin_numbers_flat]
            # Shift `loc` (location within bin) to be relative to the bin center
            scalar_values += grad * (loc - 0.5)

    if return_scalar:
        return vals, scalar_values

    return vals
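# An illustrative usage sketch: draw interpolated samples from a density defined on a
# 2D grid. The constructor signature `Sample_Grid(edges, dens)` is an assumption (only
# the `sample` method is shown above); adjust it to the actual public API as needed.
import numpy as np

xx = np.linspace(0.0, 1.0, 20)
yy = np.linspace(0.0, 2.0, 30)
dens = np.random.uniform(1.0, 2.0, (xx.size, yy.size))

sampler = Sample_Grid([xx, yy], dens)    # assumed constructor
vals = sampler.sample(10_000)            # shape (2, 10000): one row per dimension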