def KDE_statsmodels(data, kernel='gaussian'):
    """
    Evaluate a 2D statsmodels KDE on a fixed 64 x 64 grid over [-7, 7]^2.

    The one-dimensional input is turned into two variables: the data itself
    and the data scaled by 0.5.

    Parameters
    ----------
    data : ndarray
        Input samples; reshaped to a single column internally.
    kernel : str
        Not used by this function. It is accepted so the signature matches
        the other ``KDE_*`` helpers.

    Returns
    -------
    y : array-like
        Density values from ``kde.pdf`` at the 64 * 64 grid points.
    """
    points_per_axis = 64

    # Two continuous variables: the samples and a half-scaled copy
    column = data.reshape(-1, 1)
    variables = [column, column * 0.5]
    model = sm.nonparametric.KDEMultivariate(variables, var_type='cc')

    # Build the evaluation mesh as the cartesian product of two equal axes
    first_axis = np.linspace(-7, 7, num=points_per_axis)
    second_axis = np.linspace(-7, 7, num=points_per_axis)
    mesh = cartesian([first_axis, second_axis])

    density = model.pdf(mesh)
    assert len(density) == points_per_axis * points_per_axis
    return density
def KDE_sklearn(data, kernel='gaussian'):
    """
    Evaluate a 2D scikit-learn KDE on a fixed 64 x 64 grid over [-7, 7]^2.

    The one-dimensional input is turned into two columns: the data itself
    and the data scaled by 0.5.

    Parameters
    ----------
    data : ndarray
        Input samples; reshaped to a single column internally.
    kernel : str
        Kernel name passed to ``KernelDensity``. The short name ``'epa'`` is
        translated to ``'epanechnikov'``.

    Returns
    -------
    y : ndarray
        Density values (exponentiated log-densities) at the 64 * 64 grid
        points.
    """
    points_per_axis = 64

    # Translate the short kernel alias to the name used by scikit-learn
    kernel_name = 'epanechnikov' if kernel == 'epa' else kernel
    model = KernelDensity(bandwidth=1.0, kernel=kernel_name, rtol=1E-4)

    # Stack the samples next to a half-scaled copy, giving shape (obs, 2)
    column = data.reshape(-1, 1)
    samples = np.concatenate((column, column * 0.5), axis=1)
    model.fit(samples)

    axis_values = np.linspace(-7, 7, num=points_per_axis)
    mesh = cartesian([axis_values, axis_values])

    # score_samples yields log-densities, so exponentiate to get densities
    density = np.exp(model.score_samples(mesh))
    assert len(density) == points_per_axis * points_per_axis
    return density
def evaluate(self, grid_points=None):
    """
    Evaluate on equidistant grid points.

    The data is linearly binned onto the grid, the kernel is sampled once
    on a matching grid centred at zero, and the two are convolved.

    Parameters
    ----------
    grid_points: array-like, int, tuple or None
        A grid (mesh) to evaluate on. High dimensional grids must have
        shape (obs, dims). If an integer is passed, it's the number of grid
        points on an equidistant grid. If a tuple is passed, it's the
        number of grid points in each dimension. If None, a grid will be
        automatically created.

    Returns
    -------
    y: array-like
        If a grid is supplied, `y` is returned. If no grid is supplied,
        a tuple (`x`, `y`) is returned.

    Raises
    ------
    ValueError
        If the grid is not sorted, if the bandwidth is not a positive
        number, or if any data point is not strictly inside the grid.

    Examples
    --------
    >>> kde = FFTKDE().fit([1, 3, 4, 7])
    >>> # Three ways to evaluate a fitted KDE object:
    >>> x, y = kde.evaluate()  # (1) Auto grid
    >>> x, y = kde.evaluate(256)  # (2) Auto grid with 256 points
    >>> # (3) Use a custom grid (make sure it's wider than the data)
    >>> x_grid = np.linspace(-10, 25, num=2**10)  # <- Must be equidistant
    >>> y = kde.evaluate(x_grid)  # Notice that only y is returned
    """
    # The parent implementation sets self.grid_points and verifies it
    super().evaluate(grid_points)

    # FFTKDE-specific requirement: the grid has to be sorted
    if not grid_is_sorted(self.grid_points):
        raise ValueError("The grid must be sorted.")

    # Only strictly positive numeric bandwidths are accepted here
    if not (isinstance(self.bw, numbers.Number) and self.bw > 0):
        raise ValueError("The bw must be a number.")
    bandwidth = self.bw
    self.bw = bandwidth

    # Step 0 - Every data point must lie strictly inside the grid
    grid_lo = np.min(self.grid_points, axis=0)
    grid_hi = np.max(self.grid_points, axis=0)
    data_lo = np.min(self.data, axis=0)
    data_hi = np.max(self.data, axis=0)
    data_inside = (grid_lo < data_lo).all() and (grid_hi > data_hi).all()
    if not data_inside:
        raise ValueError("Every data point must be inside of the grid.")

    # Step 1 - Bin the data onto the grid to obtain the grid counts
    # TODO: Consider moving this to the fitting phase instead
    binned = linear_binning(
        self.data, grid_points=self.grid_points, weights=self.weights
    )

    # Step 2 - Compute the kernel weights
    n_dims = self.grid_points.shape[1]
    points_per_dim = np.array(
        [len(np.unique(self.grid_points[:, axis])) for axis in range(n_dims)]
    )
    intervals_per_dim = points_per_dim - 1
    spacing = (grid_hi - grid_lo) / intervals_per_dim

    # The effective bandwidth is the kernel support scaled by the bw factor
    if self.kernel.finite_support:
        effective_bw = self.kernel.support * self.bw
    else:
        # Prefer the value cached on the instance; fall back to computing
        # it when the attribute is absent
        try:
            effective_bw = self._kernel_practical_support
        except AttributeError:
            effective_bw = self.kernel.practical_support(self.bw)

    # Number of grid spacings to move out from zero in each dimension
    reach = np.minimum(
        np.floor(effective_bw / spacing), intervals_per_dim + 1
    )
    assert (spacing * reach <= effective_bw).all()

    # Sample the kernel once on a symmetric grid around zero
    kernel_shape = [int(r * 2 + 1) for r in reach]
    kernel_axes = [
        np.linspace(-step * r, step * r, size)
        for step, r, size in zip(spacing, reach, kernel_shape)
    ]
    kernel_values = self.kernel(
        cartesian(kernel_axes), bw=self.bw, norm=self.norm
    )

    # Give both operands their multi-dimensional shapes for the convolution
    kernel_values = kernel_values.reshape(*kernel_shape)
    binned = binned.reshape(*tuple(points_per_dim))

    # Step 3 - Perform the convolution.
    # All warnings are silenced while convolving; this hides a
    # FutureWarning emitted from mkl_fft's numpy FFT wrapper
    # ("Using a non-tuple sequence for multidimensional ...").
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        convolved = convolve(binned, kernel_values, mode="same")
        convolved = convolved.reshape(-1, 1)

    return self._evalate_return_logic(convolved, self.grid_points)
def linbin_Ndim(data, grid_points, weights=None):
    """
    d-dimensional linear binning, when d >= 2.

    With :math:`N` data points, and :math:`n` grid points in each dimension
    :math:`d`, the running time is :math:`O(N2^d)`. For each point the
    algorithm finds the nearest points, of which there are two in each
    dimension. Approximately 200 times faster than pure Python implementation.

    Parameters
    ----------
    data : array-like
        The data must be of shape (obs, dims).
    grid_points : array-like
        Grid, where cartesian product is already performed.
    weights : array-like
        Must have shape (obs,). Normalized internally to sum to one.

    Returns
    -------
    result : ndarray
        One value per row of `grid_points`; the values sum to one.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of observations,
        or if any input contains NaN or infinite values.
    AssertionError
        If `grid_points` is not two-dimensional, if the data has fewer than
        two dimensions, or if the binned result does not sum to one.

    Examples
    --------
    >>> from KDEpy.utils import autogrid
    >>> grid_points = autogrid(np.array([[0, 0, 0]]), num_points=(3, 3, 3))
    >>> d = linbin_Ndim(np.array([[1.0, 0, 0]]), grid_points, None)
    """
    data_obs, data_dims = data.shape
    assert len(grid_points.shape) == 2
    assert data_dims >= 2

    # Convert the data and grid points. The builtin `float` is used because
    # the `np.float` alias was removed from NumPy.
    data = np.asarray_chkfinite(data, dtype=float)
    grid_points = np.asarray_chkfinite(grid_points, dtype=float)

    if weights is not None:
        weights = np.asarray_chkfinite(weights, dtype=float)
        weights = weights / np.sum(weights)
        if data.shape[0] != len(weights):
            raise ValueError('Length of data must match length of weights.')

    obs_tot, dims = grid_points.shape

    # Compute the number of grid points for each dimension in the grid
    grid_num = np.array(
        [len(np.unique(grid_points[:, i])) for i in range(dims)]
    )

    # Scale the data to the grid, so one unit equals one grid spacing
    min_grid = np.min(grid_points, axis=0)
    max_grid = np.max(grid_points, axis=0)
    num_intervals = grid_num - 1
    dx = (max_grid - min_grid) / num_intervals
    data = (data - min_grid) / dx

    # Create results
    result = np.zeros(grid_points.shape[0], dtype=float)

    # Call the Cython implementation. There is a dedicated routine for d=2,
    # and a more general routine for d >= 3.

    # Weighted data has two specific routines
    if weights is not None:
        if data_dims >= 3:
            binary_flgs = cartesian(([0, 1], ) * dims)
            result = cutils.iterate_data_ND_weighted(
                data, weights, result, grid_num, obs_tot, binary_flgs)
        else:
            result = cutils.iterate_data_2D_weighted(
                data, weights, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)

    # Unweighted data has two specific routines too. This is because creating
    # uniform weights takes relatively long time. It's faster to have a
    # specialized routine for this case.
    else:
        if data_dims >= 3:
            binary_flgs = cartesian(([0, 1], ) * dims)
            result = cutils.iterate_data_ND(
                data, result, grid_num, obs_tot, binary_flgs)
        else:
            result = cutils.iterate_data_2D(data, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
        # Divide by the number of observations so the result sums to one
        result = result / data_obs

    assert np.allclose(np.sum(result), 1)
    return result
def linbin_Ndim(data, grid_points, weights=None):
    """
    d-dimensional linear binning, when d >= 2.

    With :math:`N` data points, and :math:`n` grid points in each dimension
    :math:`d`, the running time is :math:`O(N2^d)`. For each point the
    algorithm finds the nearest points, of which there are two in each
    dimension. Approximately 200 times faster than pure python implementation.

    Parameters
    ----------
    data : array-like
        The data must be of shape (obs, dims).
    grid_points : array-like
        Grid, where cartesian product is already performed.
    weights : array-like
        Must have shape (obs,). Normalized internally to sum to one.

    Returns
    -------
    result : ndarray
        One value per row of `grid_points`; the values sum to one.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of observations,
        or if any input contains NaN or infinite values.
    AssertionError
        If `grid_points` is not two-dimensional, if the data has fewer than
        two dimensions, or if the binned result does not sum to one.
    """
    data_obs, data_dims = data.shape
    assert len(grid_points.shape) == 2
    assert data_dims >= 2

    # Convert the data and grid points. The builtin `float` is used because
    # the `np.float` alias was removed from NumPy.
    data = np.asarray_chkfinite(data, dtype=float)
    grid_points = np.asarray_chkfinite(grid_points, dtype=float)

    if weights is not None:
        weights = np.asarray_chkfinite(weights, dtype=float)
        weights = weights / np.sum(weights)
        if data.shape[0] != len(weights):
            raise ValueError('Length of data must match length of weights.')

    obs_tot, dims = grid_points.shape

    # Compute the number of grid points for each dimension in the grid
    grid_num = np.array(
        [len(np.unique(grid_points[:, i])) for i in range(dims)]
    )

    # Scale the data to the grid, so one unit equals one grid spacing
    min_grid = np.min(grid_points, axis=0)
    max_grid = np.max(grid_points, axis=0)
    num_intervals = grid_num - 1
    dx = (max_grid - min_grid) / num_intervals
    data = (data - min_grid) / dx

    # Create results
    result = np.zeros(grid_points.shape[0], dtype=float)

    # Call the Cython implementation: a dedicated routine for d=2 and a
    # general routine for d >= 3, each in a weighted and unweighted variant
    if weights is not None:
        if data_dims >= 3:
            binary_flgs = cartesian(([0, 1], ) * dims)
            result = cutils.iterate_data_ND_weighted(
                data, weights, result, grid_num, obs_tot, binary_flgs)
        else:
            result = cutils.iterate_data_2D_weighted(
                data, weights, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
    else:
        if data_dims >= 3:
            binary_flgs = cartesian(([0, 1], ) * dims)
            result = cutils.iterate_data_ND(
                data, result, grid_num, obs_tot, binary_flgs)
        else:
            result = cutils.iterate_data_2D(data, result, grid_num, obs_tot)
        result = np.asarray_chkfinite(result, dtype=float)
        # Divide by the number of observations so the result sums to one
        result = result / data_obs

    assert np.allclose(np.sum(result), 1)
    return result
def evaluate(self, grid_points=None):
    """
    Evaluate on equidistant grid points.

    The data is linearly binned onto the grid, the kernel is sampled once
    on a matching grid centred at zero, and the two are convolved.

    Parameters
    ----------
    grid_points: array-like, int, tuple or None
        A grid (mesh) to evaluate on. High dimensional grids must have
        shape (obs, dims). If an integer is passed, it's the number of grid
        points on an equidistant grid. If a tuple is passed, it's the
        number of grid points in each dimension. If None, a grid will be
        automatically created.

    Returns
    -------
    y: array-like
        If a grid is supplied, `y` is returned. If no grid is supplied,
        a tuple (`x`, `y`) is returned.

    Raises
    ------
    ValueError
        If the bandwidth is neither a callable nor a positive number.

    Examples
    --------
    >>> kde = FFTKDE().fit([1, 3, 4, 7])
    >>> # Two ways to evaluate, either with a grid or without
    >>> x, y = kde.evaluate()
    >>> x, y = kde.evaluate(256)
    >>> y = kde.evaluate(x)
    """
    # The parent implementation sets self.grid_points and verifies it
    super().evaluate(grid_points)

    # Resolve the bandwidth: a callable is applied to the data, a positive
    # number is used as it is. The resolved value is stored on the instance.
    if callable(self.bw):
        resolved_bw = self.bw(self.data)
    elif isinstance(self.bw, numbers.Number) and self.bw > 0:
        resolved_bw = self.bw
    else:
        raise ValueError('The bw must be a callable or a number.')
    self.bw = resolved_bw

    # Step 1 - Bin the data onto the grid to obtain the grid counts
    binned = linear_binning(
        self.data, grid_points=self.grid_points, weights=self.weights
    )

    # Step 2 - Compute the kernel weights
    n_dims = self.grid_points.shape[1]
    points_per_dim = np.array(
        [len(np.unique(self.grid_points[:, axis])) for axis in range(n_dims)]
    )
    grid_lo = np.min(self.grid_points, axis=0)
    grid_hi = np.max(self.grid_points, axis=0)
    intervals_per_dim = points_per_dim - 1
    spacing = (grid_hi - grid_lo) / intervals_per_dim

    # The effective bandwidth is the kernel support scaled by the bw factor
    if self.kernel.finite_support:
        effective_bw = self.kernel.support * self.bw
    else:
        # Prefer the value cached on the instance; fall back to computing
        # it when the attribute is absent
        try:
            effective_bw = self._kernel_practical_support
        except AttributeError:
            effective_bw = self.kernel.practical_support(self.bw)

    # Number of grid spacings to move out from zero in each dimension
    reach = np.minimum(
        np.floor(effective_bw / spacing), intervals_per_dim + 1
    )
    assert (spacing * reach <= effective_bw).all()

    # Sample the kernel once on a symmetric grid around zero
    kernel_shape = [int(r * 2 + 1) for r in reach]
    kernel_axes = [
        np.linspace(-step * r, step * r, size)
        for step, r, size in zip(spacing, reach, kernel_shape)
    ]
    kernel_values = self.kernel(
        cartesian(kernel_axes), bw=self.bw, norm=self.norm
    )

    # Give both operands their multi-dimensional shapes for the convolution
    kernel_values = kernel_values.reshape(*kernel_shape)
    binned = binned.reshape(*tuple(points_per_dim))

    # Step 3 - Perform the convolution and flatten to a single column
    convolved = convolve(binned, kernel_values, mode='same')
    convolved = convolved.reshape(-1, 1)
    return self._evalate_return_logic(convolved, self.grid_points)