def test_density(self): # Check that the integral of the density equals 1. n = 100 v = np.random.rand(n) a, b = histogram(v, density=True) area = np.sum(a * np.diff(b)) assert_almost_equal(area, 1) # Check with non-constant bin widths v = np.arange(10) bins = [0, 1, 3, 6, 10] a, b = histogram(v, bins, density=True) assert_array_equal(a, .1) assert_equal(np.sum(a * np.diff(b)), 1) # Test that passing False works too a, b = histogram(v, bins, density=False) assert_array_equal(a, [1, 2, 3, 4]) # Variale bin widths are especially useful to deal with # infinities. v = np.arange(10) bins = [0, 1, 3, 6, np.inf] a, b = histogram(v, bins, density=True) assert_array_equal(a, [.1, .1, .1, 0.]) # Taken from a bug report from N. Becker on the numpy-discussion # mailing list Aug. 6, 2010. counts, dmy = np.histogram([1, 2, 3, 4], [0.5, 1.5, np.inf], density=True) assert_equal(counts, [.25, 0])
def _unique1d(ar, return_index=False, return_inverse=False, return_counts=False): """ Find the unique elements of an array, ignoring shape. """ ar = np.asanyarray(ar).flatten() optional_indices = return_index or return_inverse if optional_indices: perm = ar.argsort(kind='mergesort' if return_index else 'quicksort') aux = ar[perm] else: ar.sort() aux = ar mask = np.empty(aux.shape, dtype=np.bool_) mask[:1] = True mask[1:] = aux[1:] != aux[:-1] ret = (aux[mask], ) if return_index: ret += (perm[mask], ) if return_inverse: imask = np.cumsum(mask) - 1 inv_idx = np.empty(mask.shape, dtype=np.intp) inv_idx[perm] = imask ret += (inv_idx, ) if return_counts: idx = np.concatenate(np.nonzero(mask) + ([mask.size], )) ret += (np.diff(idx), ) return ret
def test_normed(self): sup = suppress_warnings() with sup: rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*') # Check that the integral of the density equals 1. n = 100 v = np.random.rand(n) a, b = histogram(v, normed=True) area = np.sum(a * np.diff(b)) assert_almost_equal(area, 1) assert_equal(len(rec), 1) sup = suppress_warnings() with sup: rec = sup.record(np.VisibleDeprecationWarning, '.*normed.*') # Check with non-constant bin widths (buggy but backwards # compatible) v = np.arange(10) bins = [0, 1, 5, 9, 10] a, b = histogram(v, bins, normed=True) area = np.sum(a * np.diff(b)) assert_almost_equal(area, 1) assert_equal(len(rec), 1)
def test_outliers(self): # Check that outliers are not tallied a = np.arange(10) + .5 # Lower outliers h, b = histogram(a, range=[0, 9]) assert_equal(h.sum(), 9) # Upper outliers h, b = histogram(a, range=[1, 10]) assert_equal(h.sum(), 9) # Normalization h, b = histogram(a, range=[1, 9], density=True) assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15) # Weights w = np.arange(10) + .5 h, b = histogram(a, range=[1, 9], weights=w, density=True) assert_equal((h * np.diff(b)).sum(), 1) h, b = histogram(a, bins=8, range=[1, 9], weights=w) assert_equal(h, w[1:-1])
def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None): """ Compute the multidimensional histogram of some data. Parameters ---------- sample : (N, D) array, or (D, N) array_like The data to be histogrammed. Note the unusual interpretation of sample when an array_like: * When an array, each row is a coordinate in a D-dimensional space - such as ``histogramgramdd(np.array([p1, p2, p3]))``. * When an array_like, each element is the list of values for single coordinate - such as ``histogramgramdd((X, Y, Z))``. The first form should be preferred. bins : sequence or int, optional The bin specification: * A sequence of arrays describing the bin edges along each dimension. * The number of bins for each dimension (nx, ny, ... =bins) * The number of bins for all dimensions (nx=ny=...=bins). range : sequence, optional A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges are not given explicitly in `bins`. An entry of None in the sequence results in the minimum and maximum values being used for the corresponding dimension. The default, None, is equivalent to passing a tuple of D None values. density : bool, optional If False, the default, returns the number of samples in each bin. If True, returns the probability *density* function at the bin, ``bin_count / sample_count / bin_volume``. normed : bool, optional An alias for the density argument that behaves identically. To avoid confusion with the broken normed argument to `histogram`, `density` should be preferred. weights : (N,) array_like, optional An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. Weights are normalized to 1 if normed is True. If normed is False, the values of the returned histogram are equal to the sum of the weights belonging to the samples falling into each bin. Returns ------- H : ndarray The multidimensional histogram of sample x. See normed and weights for the different possible semantics. edges : list A list of D arrays describing the bin edges for each dimension. See Also -------- histogram: 1-D histogram histogram2d: 2-D histogram Examples -------- >>> r = np.random.randn(100,3) >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) >>> H.shape, edges[0].size, edges[1].size, edges[2].size ((5, 8, 4), 6, 9, 5) """ try: # Sample is an ND-array. N, D = sample.shape except (AttributeError, ValueError): # Sample is a sequence of 1D arrays. sample = np.atleast_2d(sample).T N, D = sample.shape nbin = np.empty(D, int) edges = D * [None] dedges = D * [None] if weights is not None: weights = np.asarray(weights) try: M = len(bins) if M != D: raise ValueError( 'The dimension of bins must be equal to the dimension of the ' ' sample x.') except TypeError: # bins is an integer bins = D * [bins] # normalize the range argument if range is None: range = (None, ) * D elif len(range) != D: raise ValueError('range argument must have one entry per dimension') # Create edge arrays for i in _range(D): if np.ndim(bins[i]) == 0: if bins[i] < 1: raise ValueError( '`bins[{}]` must be positive, when an integer'.format(i)) smin, smax = _get_outer_edges(sample[:, i], range[i]) edges[i] = np.linspace(smin, smax, bins[i] + 1) elif np.ndim(bins[i]) == 1: edges[i] = np.asarray(bins[i]) if np.any(edges[i][:-1] > edges[i][1:]): raise ValueError( '`bins[{}]` must be monotonically increasing, when an array' .format(i)) else: raise ValueError( '`bins[{}]` must be a scalar or 1d array'.format(i)) nbin[i] = len(edges[i]) + 1 # includes an outlier on each end dedges[i] = np.diff(edges[i]) # Compute the bin number each sample falls into. Ncount = tuple( # avoid np.digitize to work around gh-11022 np.searchsorted(edges[i], sample[:, i], side='right') for i in _range(D)) # Using digitize, values that fall on an edge are put in the right bin. # For the rightmost bin, we want values equal to the right edge to be # counted in the last bin, and not as an outlier. for i in _range(D): # Find which points are on the rightmost edge. on_edge = (sample[:, i] == edges[i][-1]) # Shift these points one bin to the left. Ncount[i][on_edge] -= 1 # Compute the sample indices in the flattened histogram matrix. # This raises an error if the array is too large. xy = np.ravel_multi_index(Ncount, nbin) # Compute the number of repetitions in xy and assign it to the # flattened histmat. hist = np.bincount(xy, weights, minlength=nbin.prod()) # Shape into a proper matrix hist = hist.reshape(nbin) # This preserves the (bad) behavior observed in gh-7845, for now. hist = hist.astype(float, casting='safe') # Remove outliers (indices 0 and -1 for each dimension). core = D * (slice(1, -1), ) hist = hist[core] # handle the aliasing normed argument if normed is None: if density is None: density = False elif density is None: # an explicit normed argument was passed, alias it to the new name density = normed else: raise TypeError("Cannot specify both 'normed' and 'density'") if density: # calculate the probability density function s = hist.sum() for i in _range(D): shape = np.ones(D, int) shape[i] = nbin[i] - 2 hist = hist / dedges[i].reshape(shape) hist /= s if (hist.shape != nbin - 2).any(): raise RuntimeError("Internal Shape Error") return hist, edges
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): r""" Compute the histogram of a set of data. Parameters ---------- a : array_like Input data. The histogram is computed over the flattened array. bins : int or sequence of scalars or str, optional If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` is a sequence, it defines the bin edges, including the rightmost edge, allowing for non-uniform bin widths. .. versionadded:: 1.11.0 If `bins` is a string, it defines the method used to calculate the optimal bin width, as defined by `histogram_bin_edges`. range : (float, float), optional The lower and upper range of the bins. If not provided, range is simply ``(a.min(), a.max())``. Values outside the range are ignored. The first element of the range must be less than or equal to the second. `range` affects the automatic bin computation as well. While bin width is computed to be optimal based on the actual data within `range`, the bin count will fill the entire range including portions containing no data. normed : bool, optional .. deprecated:: 1.6.0 This is equivalent to the `density` argument, but produces incorrect results for unequal bin widths. It should not be used. .. versionchanged:: 1.15.0 DeprecationWarnings are actually emitted. weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count (instead of 1). If `density` is True, the weights are normalized, so that the integral of the density over the range remains 1. density : bool, optional If ``False``, the result will contain the number of samples in each bin. If ``True``, the result is the value of the probability *density* function at the bin, normalized such that the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unity width are chosen; it is not a probability *mass* function. Overrides the ``normed`` keyword if given. Returns ------- hist : array The values of the histogram. See `density` and `weights` for a description of the possible semantics. bin_edges : array of dtype float Return the bin edges ``(length(hist)+1)``. See Also -------- histogramdd, bincount, searchsorted, digitize, histogram_bin_edges Notes ----- All but the last (righthand-most) bin is half-open. In other words, if `bins` is:: [1, 2, 3, 4] then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes* 4. Examples -------- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) (array([0, 2, 1]), array([0, 1, 2, 3])) >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) (array([1, 4, 1]), array([0, 1, 2, 3])) >>> a = np.arange(5) >>> hist, bin_edges = np.histogram(a, density=True) >>> hist array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) >>> hist.sum() 2.4999999999999996 >>> np.sum(hist * np.diff(bin_edges)) 1.0 .. versionadded:: 1.11.0 Automated Bin Selection Methods example, using 2 peak random data with 2000 points: >>> import matplotlib.pyplot as plt >>> rng = np.random.RandomState(10) # deterministic random data >>> a = np.hstack((rng.normal(size=1000), ... rng.normal(loc=5, scale=2, size=1000))) >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram >>> plt.title("Histogram with 'auto' bins") >>> plt.show() """ a, weights = _ravel_and_check_weights(a, weights) bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights) # Histogram is an integer or a float array depending on the weights. if weights is None: ntype = np.dtype(np.intp) else: ntype = weights.dtype # We set a block size, as this allows us to iterate over chunks when # computing histograms, to minimize memory usage. BLOCK = 65536 # The fast path uses bincount, but that only works for certain types # of weight simple_weights = (weights is None or np.can_cast(weights.dtype, np.double) or np.can_cast(weights.dtype, complex)) if uniform_bins is not None and simple_weights: # Fast algorithm for equal bins # We now convert values of a to bin indices, under the assumption of # equal bin widths (which is valid here). first_edge, last_edge, n_equal_bins = uniform_bins # Initialize empty histogram n = np.zeros(n_equal_bins, ntype) # Pre-compute histogram scaling factor norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge) # We iterate over blocks here for two reasons: the first is that for # large arrays, it is actually faster (for example for a 10^8 array it # is 2x as fast) and it results in a memory footprint 3x lower in the # limit of large arrays. for i in _range(0, len(a), BLOCK): tmp_a = a[i:i + BLOCK] if weights is None: tmp_w = None else: tmp_w = weights[i:i + BLOCK] # Only include values in the right range keep = (tmp_a >= first_edge) keep &= (tmp_a <= last_edge) if not np.logical_and.reduce(keep): tmp_a = tmp_a[keep] if tmp_w is not None: tmp_w = tmp_w[keep] # This cast ensures no type promotions occur below, which gh-10322 # make unpredictable. Getting it wrong leads to precision errors # like gh-8123. tmp_a = tmp_a.astype(bin_edges.dtype, copy=False) # Compute the bin indices, and for values that lie exactly on # last_edge we need to subtract one f_indices = _unsigned_subtract(tmp_a, first_edge) * norm indices = f_indices.astype(np.intp) indices[indices == n_equal_bins] -= 1 # The index computation is not guaranteed to give exactly # consistent results within ~1 ULP of the bin edges. decrement = tmp_a < bin_edges[indices] indices[decrement] -= 1 # The last bin includes the right edge. The other bins do not. increment = ((tmp_a >= bin_edges[indices + 1]) & (indices != n_equal_bins - 1)) indices[increment] += 1 # We now compute the histogram using bincount if ntype.kind == 'c': n.real += np.bincount(indices, weights=tmp_w.real, minlength=n_equal_bins) n.imag += np.bincount(indices, weights=tmp_w.imag, minlength=n_equal_bins) else: n += np.bincount(indices, weights=tmp_w, minlength=n_equal_bins).astype(ntype) else: # Compute via cumulative histogram cum_n = np.zeros(bin_edges.shape, ntype) if weights is None: for i in _range(0, len(a), BLOCK): sa = np.sort(a[i:i + BLOCK]) cum_n += _search_sorted_inclusive(sa, bin_edges) else: zero = np.zeros(1, dtype=ntype) for i in _range(0, len(a), BLOCK): tmp_a = a[i:i + BLOCK] tmp_w = weights[i:i + BLOCK] sorting_index = np.argsort(tmp_a) sa = tmp_a[sorting_index] sw = tmp_w[sorting_index] cw = np.concatenate((zero, sw.cumsum())) bin_index = _search_sorted_inclusive(sa, bin_edges) cum_n += cw[bin_index] n = np.diff(cum_n) # density overrides the normed keyword if density is not None: if normed is not None: # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) warnings.warn( "The normed argument is ignored when density is provided. " "In future passing both will result in an error.", DeprecationWarning, stacklevel=2) normed = None if density: db = np.array(np.diff(bin_edges), float) return n / db / n.sum(), bin_edges elif normed: # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) warnings.warn( "Passing `normed=True` on non-uniform bins has always been " "broken, and computes neither the probability density " "function nor the probability mass function. " "The result is only correct if the bins are uniform, when " "density=True will produce the same result anyway. " "The argument will be removed in a future version of " "numpy.", np.VisibleDeprecationWarning, stacklevel=2) # this normalization is incorrect, but db = np.array(np.diff(bin_edges), float) return n / (n * db).sum(), bin_edges else: if normed is not None: # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) warnings.warn( "Passing normed=False is deprecated, and has no effect. " "Consider passing the density argument instead.", DeprecationWarning, stacklevel=2) return n, bin_edges