def test_cython_binning(self, dims, use_weights, eq_grid):
    """
    Compare the fast N-dimensional linear binning with the naive Python
    reference implementation.

    The comparison is parametrized over weighted / unweighted data and over
    grids with the same or a different number of points in each direction.

    Parameters
    ----------
    dims : int
        Number of dimensions of the data and of the grid.
    use_weights : bool
        If True, random weights are passed to both implementations,
        otherwise `weights=None` is used.
    eq_grid : bool
        If True, the grid has 16 points in every direction, otherwise the
        number of points per direction is drawn from `random.randint(8, 16)`.
    """
    num_samples = 1000

    # One row per observation, one column per dimension
    flat_samples = np.random.randn(dims * num_samples)
    data = flat_samples.reshape(num_samples, dims) / 7

    weights = np.random.randn(num_samples) if use_weights else None

    if eq_grid:
        grid_shape = (16,) * dims
    else:
        grid_shape = tuple(random.randint(8, 16) for _ in range(dims))

    # The grid is generated around a single point placed at the origin
    origin = np.array([[0] * dims])
    grid_points = autogrid(origin, num_points=grid_shape)

    fast = linear_binning(data, grid_points, weights=weights)
    reference = linbin_Ndim_python(data, grid_points, weights=weights)

    assert np.allclose(fast, reference)
def evaluate(self, grid_points=None, bw_to_scalar=True):
    """
    Evaluate the kernel density estimator on the grid points.

    This method resolves the bandwidth and the grid and stores both on the
    instance (`self.bw` and `self.grid_points`). It has no return value.

    Parameters
    ----------
    grid_points : integer, tuple or array-like
        If an integer, the number of equidistant grid point in every
        dimension. If a tuple, the number of grid points in each dimension.
        If array-like, grid points of shape (obs, dims). If None, the grid
        is generated by `autogrid`.
    bw_to_scalar : bool
        If True and the bandwidth is an array or a sequence, its largest
        element is used as the bandwidth.

    Raises
    ------
    ValueError
        If the instance has no `data` attribute, or if the grid contains
        no points.
    """
    if not hasattr(self, 'data'):
        raise ValueError('Must call fit before evaluating.')

    # ------------------------ Resolve the bandwidth ----------------------
    bandwidth = self.bw
    if isinstance(bandwidth, (np.ndarray, Sequence)):
        if bw_to_scalar:
            bandwidth = max(bandwidth)
    elif callable(bandwidth):
        bandwidth = bandwidth(self.data)
    self.bw = bandwidth

    # -------------------------- Resolve the grid -------------------------
    # None, an integer or a tuple is handed over to autogrid; anything
    # else is treated as a grid supplied by the user
    use_autogrid = grid_points is None or isinstance(
        grid_points, (numbers.Integral, tuple)
    )
    self._user_supplied_grid = not use_autogrid

    if use_autogrid:
        support = self.kernel.practical_support(bandwidth)
        grid_points = autogrid(self.data, support, grid_points)
        # Stored so that kernel.practical_support(bw) need not be called again
        self._kernel_practical_support = support
    else:
        grid_points = self._process_sequence(grid_points)

    num_rows, _ = grid_points.shape
    if num_rows < 1:
        raise ValueError('Grid must contain at least one data point.')

    self.grid_points = grid_points

    # Quick sanity checks on the state that was just set
    if bw_to_scalar:
        assert isinstance(self.bw, numbers.Number)
        assert self.bw > 0
    assert len(self.grid_points.shape) == 2
def test_binning_correctness_single_point(self, dims):
    """
    Place a single data point slightly below a random integer position and
    check that the grid point next to it receives a large weight.

    For every grid point whose squared distance to the data point is below
    `eps`, the weight assigned by linear binning must be close to
    `(1 - eps) ** dims`.

    Parameters
    ----------
    dims : int
        Number of dimensions of the data point and of the grid.
    """
    eps = 10e-6
    for subtest in range(25):
        # A dedicated, seeded generator makes every subtest reproducible
        # without touching the global NumPy random state
        rng = np.random.RandomState(subtest)
        data = rng.randint(-2, 2, size=(1, dims)) - eps

        grid_points = autogrid(np.array([[0] * dims]), num_points=(7,) * dims)
        answer = linear_binning(data, grid_points)

        point = data.ravel()
        matches = 0
        for grid_point, a in zip(grid_points, answer):
            diff = np.sum((grid_point - point) ** 2)
            if diff < eps:
                matches += 1
                assert np.allclose(a, (1 - eps) ** dims)

        # Without this guard the test would pass vacuously whenever no grid
        # point lies next to the data point
        assert matches > 0, 'No grid point is close to the data point.'
def test_binning_correctness_single_point(self, dims):
    """
    Check that linear binning concentrates the weight of a lone data point
    on the grid point right next to it.

    A single data point is placed `eps` below a random integer position on
    an integer grid. The grid point within a squared distance of `eps` from
    the data point must receive a weight close to `(1 - eps) ** dims`.

    Parameters
    ----------
    dims : int
        Number of dimensions of the data point and of the grid.
    """
    eps = 10e-6
    expected_weight = (1 - eps) ** dims

    for seed in range(25):
        # Seed per subtest, then draw integer coordinates with
        # randint(-2, 2) and shift them down by eps
        np.random.seed(seed)
        data = np.random.randint(-2, 2, size=(1, dims)) - eps

        # Grid points [-3, -2, -1, 0, 1, 2, 3]^dims
        origin = np.array([[0] * dims])
        grid_points = autogrid(origin, num_points=(7,) * dims)

        answer = linear_binning(data, grid_points)

        # Only the grid point sitting next to the data point is checked
        point = data.ravel()
        for grid_point, weight in zip(grid_points, answer):
            squared_distance = np.sum((grid_point - point) ** 2)
            if not squared_distance < eps:
                continue
            assert np.allclose(weight, expected_weight)
def improved_sheather_jones(data):
    """
    Compute a bandwidth using the Improved Sheather Jones (ISJ) algorithm
    from the paper by Botev et al.

    This algorithm computes the optimal bandwidth for a gaussian kernel,
    and works very well for bimodal data (unlike other rules). The
    disadvantage of this algorithm is longer computation time, and the fact
    that this implementation does not always converge if very few data
    points are supplied.

    Understanding this algorithm is difficult, see:
    https://books.google.no/books?id=Trj9HQ7G8TUC&pg=PA328&lpg=PA328&dq=
    sheather+jones+why+use+dct&source=bl&ots=1ETdKd_6EF&sig=jZk4R515GB1xsn-
    VZVnjr-JfjSI&hl=en&sa=X&ved=2ahUKEwi1_czNncTcAhVGhqYKHaPiBtcQ6AEwA3oEC
    AcQAQ#v=onepage&q=sheather%20jones%20why%20use%20dct&f=false

    Parameters
    ----------
    data : array-like
        Data of shape (obs, 1).

    Returns
    -------
    bandwidth
        The square root of the optimal `t`, scaled by the range of the data.

    Raises
    ------
    ValueError
        If the data does not have exactly one dimension (column).
    """
    _, dims = data.shape
    if dims != 1:
        raise ValueError('ISJ is only available for 1D data.')

    n = 2**10

    # Equidistant mesh that the data is binned onto
    xmesh = autogrid(data, boundary_abs=6, num_points=n, boundary_rel=0.5)
    xmesh = xmesh.ravel()
    data = data.ravel()

    data_range = np.max(data) - np.min(data)
    num_unique = len(np.unique(data))

    # Linear binning onto the equidistant mesh is a prerequisite for using
    # the FFT, which needs evenly spaced samples
    binned = linear_binning(data.reshape(-1, 1), xmesh)
    assert np.allclose(binned.sum(), 1)

    # Type 2 Discrete Cosine Transform (DCT) of the binned data
    dct_coeffs = fftpack.dct(binned)

    # Quantities entering the fixed point equation
    I_sq = np.arange(1, n, dtype=FLOAT) ** 2
    a2 = dct_coeffs[1:] ** 2 / 4

    # Solve for the optimal (in the AMISE sense) t
    t_star = _root(_fixed_point, num_unique, args=(num_unique, I_sq, a2))

    # The remainder of the ISJ algorithm would smooth the DCT coefficients
    # with the optimal t and invert the transform to get a density estimate.
    # That part is left out on purpose: this function is only used to
    # compute the bandwidth, which may also be used for kernels other than
    # the Gaussian kernel.
    return np.sqrt(t_star) * data_range