class KdeSampler:
    """Kernel Density Estimation based sampler.

    New points are drawn sequentially: candidates are sampled by a
    Metropolis random walk from a density that is low around the points
    already present, and the candidate that minimises a space-filling
    criterion is appended to the sample.
    """

    def __init__(self, sample=None, dim=2, n_sample_bin=1000, bw=1):
        """Sampler creation.

        :param array_like sample: Sample to start from,
          shape (n_samples, n_features). When omitted, a single point is
          drawn uniformly in the unit hypercube.
        :param int dim: Dimension of the parameter space.
        :param int n_sample_bin: Number of sample of the bin. Currently
          unused (the bin-based sampling is disabled); kept so existing
          callers keep working.
        :param float bw: Bandwidth of the KDE.
        """
        self.dim = dim
        if sample is None:
            self.space = [np.random.random_sample(self.dim)]
        else:
            self.space = sample
        self.n_samples = len(self.space)
        self.bw = bw
        # Row 0 holds the lower corner, row 1 the upper corner.
        self.bounds = np.array([[0] * self.dim, [1] * self.dim])

        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bw,
                                 metric='pyfunc', rtol=1e-4,
                                 metric_params={'func': self.metric_func})
        self.kde.fit(self.space)
        # Pristine copy used by ``generate`` to restart from the initial fit.
        self.kde_ = copy.deepcopy(self.kde)

    def _scaled_bandwidth(self):
        """Bandwidth shrunk with the sample size: ``bw / n_samples**(1/dim)``.

        Shared by ``pdf`` and ``generate`` so that the normalisation applied
        in ``pdf`` always matches the bandwidth the KDE was refitted with.
        """
        return self.bw / self.n_samples ** (1.0 / self.dim)

    def metric_func(self, x, other):
        """Distance used by the KDE: Minkowski form with ``p = 0.5``.

        :param array_like x: First point, shape (n_features,).
        :param array_like other: Second point, shape (n_features,).
        :return: ``0`` when ``x`` lies outside ``self.bounds``, otherwise
          ``sum(|x - other|**p)**(1 / p)``.
        :rtype: float
        """
        p = 0.5

        # Bounds exclusion: a zero distance maximises the kernel value, which
        # in turn drives the scaled PDF towards zero outside of the domain.
        inside = np.logical_and(x >= self.bounds[0], x <= self.bounds[1])
        if not np.all(inside):
            return 0

        return np.sum(abs(x - other) ** p) ** (1. / p)

    def pdf(self, x, kernel='gaussian'):
        """Scale PDF between 0 and 1.

        The KDE density is multiplied by the normalisation constant of a
        Gaussian kernel and by the number of samples, then subtracted from
        one; negative values are set to zero.

        :param array_like x: Points to evaluate, shape
          (n_points, n_features).
        :param str kernel: Unused; only the Gaussian scaling is implemented.
        :return: Scaled values, one per point.
        :rtype: numpy.ndarray
        """
        pdf_base = np.exp(self.kde.score_samples(x))
        sigma_fin = self._scaled_bandwidth()
        pdf = 1 - (2 * np.pi) ** (self.dim / 2.0) \
            * sigma_fin ** self.dim * pdf_base * self.n_samples
        pdf[pdf < 0] = 0
        return pdf

    @staticmethod
    def _metropolis_accept(old, new):
        """Metropolis acceptance test on log-densities."""
        return np.log(np.random.uniform()) < new - old

    @staticmethod
    def _proposal(x):
        """Draw a truncated-normal step around ``x``, returned as (1, dim)."""
        lower, upper = -0.1, 1.1
        sigma = 0.3
        return np.array([
            truncnorm.rvs((lower - xi) / sigma, (upper - xi) / sigma,
                          loc=xi, scale=sigma)
            for xi in x
        ]).reshape(1, -1)

    @classmethod
    def _metropolis(cls, logp, n_samples, init):
        """Random-walk Metropolis chain returning ``n_samples`` states.

        :param callable logp: Log-density, called with a (1, dim) array.
        :param int n_samples: Number of states to collect.
        :param array_like init: Starting point, shape (dim,).
        :return: Collected states, shape (n_samples, dim).
        :rtype: numpy.ndarray
        """
        old = cls._proposal(init)
        # ``logp`` is deterministic, so the current state's value is cached
        # and only refreshed when a move is accepted.
        logp_old = logp(old)
        samples = []

        while len(samples) < n_samples:
            new = cls._proposal(old)
            logp_new = logp(new)

            if cls._metropolis_accept(logp_old, logp_new):
                old = new
                logp_old = logp_new

            # States with a zero density are never recorded.
            if np.exp(logp_old) > 0:
                samples.append(old)

        return np.atleast_2d(samples)[:n_samples].reshape(n_samples, -1)

    def sample_kde(self, n_samples=1):
        """Generate random samples from the model.

        :param int n_samples: Number of samples to generate.
        :return: List of samples.
        :rtype: array_like, shape (n_samples, n_features)
        """
        # log(0) and inf - inf are expected where the scaled PDF vanishes.
        with np.errstate(divide='ignore', invalid='ignore'):
            samples = self._metropolis(lambda x: np.log(self.pdf(x)),
                                       n_samples,
                                       np.random.random(self.bounds.shape[1]))

        return samples

    def generate(self, n_samples=2):
        """Generate samples.

        Using the KDE, generate new samples following the PDF.
        The sample giving the best improvement in terms of discrepancy is
        kept (``ot.SpaceFillingC2`` criterion, lowest value wins).

        Update the KDE after each sample is added to the sampling.

        :param int n_samples: Number of samples to generate. The loop adds
          ``n_samples - 1`` points to the initial sample.
        :return: Sample.
        :rtype: array_like, shape (len(initial sample) + n_samples - 1,
          n_features)
        """
        # Always restart from the KDE fitted on the initial sample.
        self.kde = copy.deepcopy(self.kde_)
        sample = list(copy.deepcopy(self.space))
        self.n_samples = len(sample)

        criterion = ot.SpaceFillingC2()

        for _ in range(n_samples - 1):
            sample_ = self.sample_kde(500)

            # Kept on the instance for inspection of the last iteration.
            self.sample_ = sample_
            self.kde_prev = copy.deepcopy(self.kde)

            disc = [criterion.evaluate(np.vstack([sample, s]))
                    for s in sample_]
            sample.append(sample_[np.argmin(disc)])

            self.n_samples = len(sample)
            # Same scaling as in ``pdf``; the previous hard-coded ``1 / 2``
            # exponent was only correct for ``dim == 2``.
            self.kde.set_params(bandwidth=self._scaled_bandwidth(),
                                metric_params={'func': self.metric_func})
            self.kde.fit(sample)

        return np.array(sample)
class KDEestimator:
    """
    An interface for generating random numbers according to a given Kernel
    Density Estimation (KDE) parametrization based on the data.

    The bandwidth is selected from the data with the method of Botev et al.
    and the density itself is handled by scikit-learn's ``KernelDensity``.
    """

    def __init__(self, bandwidth=1.0):
        """Create the estimator.

        :param float bandwidth: Initial bandwidth of the KDE; it is
          overwritten by ``fit``.
        """
        # Public import path: the ``sklearn.neighbors.kde`` submodule was
        # private and is no longer available in recent scikit-learn releases.
        from sklearn.neighbors import KernelDensity
        self.bandwidth = bandwidth
        self.model = KernelDensity(bandwidth=self.bandwidth)

    def _botev_fixed_point(self, t, M, I, a2):
        """Fixed-point residual whose root is the squared bandwidth.

        :param float t: Candidate value.
        :param int M: Number of data points.
        :param numpy.ndarray I: Squared integer frequencies.
        :param numpy.ndarray a2: Squared, halved DCT coefficients.
        :return: ``t - (2 * M * sqrt(pi) * f)**(-2 / 5)``.
        """
        # ``np.longdouble`` is the widest float on every platform (float128,
        # float96 or float64); ``I**7`` is very large, hence the extra range.
        I = np.asarray(I, dtype=np.longdouble)
        M = np.longdouble(M)
        a2 = np.asarray(a2, dtype=np.longdouble)

        order = 7
        f = 2 * np.pi ** (2 * order) * \
            np.sum(I ** order * a2 * np.exp(-I * np.pi ** 2 * t))
        # NOTE(review): the loop starts at ``s = order``; other
        # implementations of this method may start at ``order - 1`` --
        # confirm against the reference before changing.
        for s in range(order, 1, -1):
            K0 = np.prod(np.arange(1, 2 * s, 2)) / np.sqrt(2 * np.pi)
            const = (1 + 0.5 ** (s + 0.5)) / 3
            t_s = (2 * const * K0 / M / f) ** (2.0 / (3 + 2 * s))
            f = 2 * np.pi ** (2 * s) * \
                np.sum(I ** s * a2 * np.exp(-I * np.pi ** 2 * t_s))
        return t - (2 * M * np.sqrt(np.pi) * f) ** (-2.0 / 5)

    def finite(self, val):
        """
        Checks if a value is finite or not
        """
        return val is not None and np.isfinite(val)

    def botev_bandwidth(self, data):
        """
        Implementation of the KDE bandwidth selection method outline in:
        Z. I. Botev, J. F. Grotowski, and D. P. Kroese. *Kernel density
        estimation via diffusion.* The Annals of Statistics,
        38(5):2916-2957, 2010.

        Based on the implementation of Daniel B. Smith, PhD.
        Forked from the package
        `PyQT_fit <https://code.google.com/archive/p/pyqt-fit/>`_.

        :param data: 1D array containing the data to model with a 1D KDE.
        :type data: numpy.ndarray
        :returns: Optimal bandwidth according to the data.
        """
        from scipy import fftpack, optimize

        data = np.asarray(data).ravel()

        # Number of histogram bins.
        N = 2 ** 10

        # Pad the observed range by 10% on each side.
        minimum = np.min(data)
        maximum = np.max(data)
        span = maximum - minimum
        lower = minimum - span / 10
        upper = maximum + span / 10
        # Range of the data
        span = upper - lower

        # Histogram of the data to get a crude approximation of the density.
        # The histogram is unweighted: ``weights`` was previously referenced
        # without a live assignment.
        M = len(data)
        DataHist, _ = np.histogram(data, bins=N, range=(lower, upper),
                                   weights=None)
        DataHist = DataHist / M
        DCTData = fftpack.dct(DataHist, norm=None)

        I = np.arange(1, N, dtype=int) ** 2
        SqDCTData = (DCTData[1:] / 2) ** 2
        guess = 0.1

        try:
            t_star = optimize.brentq(self._botev_fixed_point, 0, guess,
                                     args=(M, I, SqDCTData))
        except ValueError:
            # No sign change on [0, guess]: fall back to a rule of thumb.
            # NOTE(review): this uses the bin count ``N``, not the number of
            # data points ``M`` -- confirm which one is intended.
            t_star = .28 * N ** (-.4)

        return np.sqrt(t_star) * span

    def fit(self, x):
        """Select the bandwidth from ``x`` and fit the KDE on it.

        :param array_like x: Data; it is flattened and treated as 1D.
        """
        x = np.asarray(x)
        self.bandwidth = self.botev_bandwidth(x.ravel())
        self.model.set_params(bandwidth=self.bandwidth)
        self.model.fit(x.reshape(-1, 1))

    def sample(self, dimension=1.0):
        """Draw random values from the fitted KDE.

        :param dimension: Number of values to draw; cast to ``int`` because
          the default is a float and the model expects a count.
        :return: Result of ``self.model.sample``.
        """
        return self.model.sample(int(dimension))

    def pdf(self, x):
        """Evaluate the fitted model at ``x``.

        NOTE(review): this returns ``self.model.score_samples(x)``, which for
        scikit-learn's ``KernelDensity`` is the *log*-density; apply
        ``np.exp`` to obtain the density itself. Left unchanged as callers
        may rely on the current return value.

        :param array_like x: Points to evaluate, shape (n_points, 1).
        :return: Output of ``score_samples`` for each point.
        """
        return self.model.score_samples(x)