Example #1
File: test_ndim.py  Project: zhaohb/cupy
    def test_ndim_array_function(self):
        a = cupy.ones((4, 4))
        assert numpy.ndim(a) == 2

        a = cupy.asarray(5)
        assert numpy.ndim(a) == 0

        a = numpy.ones((4, 4))
        assert cupy.ndim(a) == 2

        a = numpy.asarray(5)
        assert cupy.ndim(a) == 0
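
For context, numpy.ndim first tries the argument's own .ndim attribute and only
falls back to asarray, which is why it works on CuPy arrays without a device
transfer. It also accepts plain Python objects; a minimal sketch:

import numpy

# No array construction needed; numpy.ndim inspects the nesting depth.
assert numpy.ndim([[1, 2], [3, 4]]) == 2
assert numpy.ndim(5) == 0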
Example #2
def tensordot_adjoint_0(B, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # A |--> np.tensordot(A, B, axes)
    if B_ndim == 0:
        return G * B

    G_axes = ocp.arange(ocp.ndim(G))
    if type(axes) is int:
        axes = max(axes, 0)
        B_axes = ocp.arange(B_ndim)
        return ocp.tensordot(G, B, [G_axes[A_ndim - axes:], B_axes[axes:]])

    elif type(axes[0]) is int:
        axes = [axes[0] % A_ndim, axes[1] % B_ndim]
        B_axes = ocp.arange(B_ndim)
        return ocp.tensordot(
            G, B, [G_axes[A_ndim - 1:],
                   ocp.delete(B_axes, axes[1])])  # noqa: E501

    else:
        A_axes = ocp.arange(A_ndim)
        B_axes = ocp.arange(B_ndim)
        summed_axes = [
            ocp.asarray(axes[0]) % A_ndim,
            ocp.asarray(axes[1]) % B_ndim,
        ]  # noqa: E501
        other_axes = [
            ocp.delete(A_axes, summed_axes[0]),
            ocp.delete(B_axes, summed_axes[1]),  # noqa: E501
        ]
        out = ocp.tensordot(G, B, [G_axes[len(other_axes[0]):], other_axes[1]])
        perm = ocp.argsort(
            ocp.concatenate(
                (other_axes[0], summed_axes[0][ocp.argsort(summed_axes[1])])))
        return ocp.transpose(out, perm)
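
As a sanity check, here is a minimal sketch assuming the module's ocp alias is
NumPy-compatible (NumPy itself stands in below): the adjoint returned for A
must have A's shape.

import numpy as ocp  # assumption: ocp is a NumPy-compatible module

A = ocp.random.rand(3, 4)
B = ocp.random.rand(4, 5)
G = ocp.ones((3, 5))  # upstream gradient of ocp.tensordot(A, B, 1)
dA = tensordot_adjoint_0(B, G, 1, A.ndim, B.ndim)
assert dA.shape == A.shape  # (3, 4)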
Example #3
    def explain(self, x, scaled=True):
        """
        Return explanation of the anomalies based on t-scores.
        """
        if cp.ndim(x) < 2:
            x = x.reshape(1, -1)
        ranked_feature_importance = cp.zeros([x.shape[1], 1])

        for feature in range(x.shape[1]):
            # find all projections without the feature j and with feature j
            index_selected_feature = cp.where(
                self.projections[:, feature] != 0)[0]
            index_not_selected_feature = cp.where(
                self.projections[:, feature] == 0)[0]
            scores_with_feature = self.instance_score(x,
                                                      index_selected_feature)
            scores_without_feature = self.instance_score(
                x, index_not_selected_feature)
            ranked_feature_importance[feature, 0] = self.t_test(
                scores_with_feature, scores_without_feature)

        if scaled:
            assert cp.max(ranked_feature_importance) != cp.min(
                ranked_feature_importance)
            normalized_score = (ranked_feature_importance - cp.min(
                ranked_feature_importance)) / (
                cp.max(ranked_feature_importance) - cp.min(
                    ranked_feature_importance))
            return normalized_score
        else:
            return ranked_feature_importance
Example #4
    def score(self, input_data):
        """
        Calculate anomaly scores using negative log-likelihood across n_random_cuts histograms.

        :param input_data: NxD training sample
        :type input_data: cupy.ndarray

        Examples
        --------
        >>> from clx.analytics.loda import Loda
        >>> import cupy as cp
        >>> x = cp.random.randn(100,5) # 5-D multivariate synthetic dataset
        >>> loda_ad = Loda(n_bins=None, n_random_cuts=100)
        >>> loda_ad.fit(x)
        >>> loda_ad.score(x)
        array([0.04295848, 0.02853553, 0.04587308, 0.03750692, 0.05050418,
        0.02671958, 0.03538646, 0.05606504, 0.03418612, 0.04040502,
        0.03542846, 0.02801463, 0.04884918, 0.02943411, 0.02741364,
        0.02702433, 0.03064191, 0.02575712, 0.03957355, 0.02729784,
        ...
        0.03943715, 0.02701243, 0.02880341, 0.04086408, 0.04365477])
        """
        if cp.ndim(input_data) < 2:
            input_data = input_data.reshape(1, -1)
        pred_scores = cp.zeros([input_data.shape[0], 1])
        for i in range(self._n_random_cuts):
            projected_data = self._projections[i, :].dot(input_data.T)
            inds = cp.searchsorted(self._limits[i, :self._n_bins - 1],
                                   projected_data,
                                   side='left')
            pred_scores[:, 0] += -self._weights[i] * cp.log(
                self._histograms[i, inds])
        pred_scores /= self._n_random_cuts
        return pred_scores.ravel()
Example #5
    def score(self, X):
        if cp.ndim(X) < 2:
            X = X.reshape(1, -1)
        pred_scores = cp.zeros([X.shape[0], 1])
        for i in range(self.n_random_cuts):
            projected_data = self.projections[i, :].dot(X.T)
            inds = cp.searchsorted(self.limits[i, :self.n_bins - 1],
                                   projected_data, side='left')
            pred_scores[:, 0] += -self.weights[i] * cp.log(
                self.histograms[i, inds])
        pred_scores /= self.n_random_cuts
        return pred_scores.ravel()
Example #6
    def instance_score(self, x, projection_index):
        """
        Return scores from the selected projection indices.
        x (cupy.ndarray) : D x 1 feature instance.
        """
        if cp.ndim(x) < 2:
            x = x.reshape(1, -1)
        pred_scores = cp.zeros([x.shape[0], len(projection_index)])
        # Enumerate so the output column matches the position within
        # projection_index; indexing by the raw projection id would overrun
        # pred_scores whenever an index exceeds its width.
        for col, i in enumerate(projection_index):
            projected_data = self.projections[i, :].dot(x.T)
            inds = cp.searchsorted(self.limits[i, :self.n_bins - 1],
                                   projected_data, side='left')
            pred_scores[:, col] = -self.weights[i] * cp.log(
                self.histograms[i, inds])
        return pred_scores
Example #7
def time_bin_with_mask(data, time_bin_length, mask=None):
    """Returns time binned data where only about non-masked values is averaged.

    Parameters
    ----------
    data : array
        Data array of shape (time, variables).

    time_bin_length : int
        Length of time bin.

    mask : bool array, optional (default: None)
        Data mask where True labels masked samples.

    Returns
    -------
    (bindata, T) : tuple of array and int
        Tuple of time-binned data array and new length of array.
    """

    T = len(data)

    time_bin_length = int(time_bin_length)

    if mask is None:
        sample_selector = np.ones(data.shape)
    else:
        # Invert mask
        sample_selector = (mask == False)

    if np.ndim(data) == 1:
        data.shape = (T, 1)
        # Reshape the selector rather than `mask`, which may be None.
        sample_selector.shape = (T, 1)

    bindata = np.zeros((T // time_bin_length, ) + data.shape[1:],
                       dtype="float32")
    for index, i in enumerate(
            range(0, T - time_bin_length + 1, time_bin_length)):
        # print weighted_avg_and_std(fulldata[i:i+time_bin_length], axis=0,
        # weights=sample_selector[i:i+time_bin_length])[0]
        bindata[index] = weighted_avg_and_std(
            data[i:i + time_bin_length],
            axis=0,
            weights=sample_selector[i:i + time_bin_length])[0]

    T, grid_size = bindata.shape

    return (bindata.squeeze(), T)
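
A minimal usage sketch with hypothetical data (assumes weighted_avg_and_std
from the same module is available):

import numpy as np

data = np.random.randn(100, 3)  # 100 time steps, 3 variables
bindata, T_new = time_bin_with_mask(data, time_bin_length=10)
assert T_new == 10 and bindata.shape == (10, 3)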
Example #8
    def explain(self, anomaly, scaled=True):
        """
        Explain anomaly based on contributions (t-scores) of each feature across histograms.

        :param anomaly: selected anomaly from input dataset
        :type anomaly: cupy.ndarray
        :param scaled: set to scale output feature importance scores
        :type scaled: boolean

        Examples
        --------
        >>> loda_ad.explain(x[5]) # x[5] is found anomaly
        array([[1.        ],
        [0.        ],
        [0.69850349],
        [0.91081035],
        [0.78774349]])
        """
        if cp.ndim(anomaly) < 2:
            anomaly = anomaly.reshape(1, -1)
        ranked_feature_importance = cp.zeros([anomaly.shape[1], 1])

        for feature in range(anomaly.shape[1]):
            # find all projections without the feature j and with feature j
            index_selected_feature = cp.where(
                self._projections[:, feature] != 0)[0]
            index_not_selected_feature = cp.where(
                self._projections[:, feature] == 0)[0]
            scores_with_feature = self._instance_score(anomaly,
                                                       index_selected_feature)
            scores_without_feature = self._instance_score(
                anomaly, index_not_selected_feature)
            ranked_feature_importance[feature, 0] = self._t_test(
                scores_with_feature, scores_without_feature)

        if scaled:
            assert cp.max(ranked_feature_importance) != cp.min(
                ranked_feature_importance)
            normalized_score = (ranked_feature_importance -
                                cp.min(ranked_feature_importance)) / (
                                    cp.max(ranked_feature_importance) -
                                    cp.min(ranked_feature_importance))
            return normalized_score
        else:
            return ranked_feature_importance
Example #9
def lowhighpass_filter(data, cutperiod, pass_periods='low'):
    """Butterworth low- or high pass filter.

    This function applies a linear filter twice, once forward and once
    backwards. The combined filter has linear phase.

    Parameters
    ----------
    data : array
        Data array of shape (time, variables).

    cutperiod : int
        Period of cutoff.

    pass_periods : str, optional (default: 'low')
        Either 'low' or 'high' to act as a low- or high-pass filter.

    Returns
    -------
    data : array
        Filtered data array.
    """
    try:
        from scipy.signal import butter, filtfilt
    except ImportError:
        raise ImportError('Could not import scipy.signal for Butterworth '
                          'filtering!')

    fs = 1.
    order = 3
    ws = 1. / cutperiod / (0.5 * fs)
    b, a = butter(order, ws, pass_periods)
    if np.ndim(data) == 1:
        data = filtfilt(b, a, data)
    else:
        for i in range(data.shape[1]):
            data[:, i] = filtfilt(b, a, data[:, i])

    return data
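
For example, with a hypothetical two-tone signal a low pass at cutperiod=10
keeps only the slow oscillation; the normalized cutoff is (1/10) / (0.5*1.) = 0.2:

import numpy as np

t = np.arange(500, dtype=float)
signal = np.sin(2 * np.pi * t / 50.) + np.sin(2 * np.pi * t / 5.)
slow = lowhighpass_filter(signal.copy(), cutperiod=10, pass_periods='low')
assert slow.shape == signal.shape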
Example #10
def tensordot_adjoint_1(A, G, axes, A_ndim, B_ndim):
    # The adjoint of the operator
    # B |--> np.tensordot(A, B, axes)
    if A_ndim == 0:
        return G * A

    G_axes = ocp.arange(ocp.ndim(G))
    if type(axes) is int:
        axes = max(axes, 0)
        A_axes = ocp.arange(A_ndim)
        return ocp.tensordot(
            A, G,
            [A_axes[:A_ndim - axes], G_axes[:A_ndim - axes]])  # noqa: E501

    elif type(axes[0]) is int:
        axes = [axes[0] % A_ndim, axes[1] % B_ndim]
        A_axes = ocp.arange(A_ndim)
        return ocp.tensordot(
            A, G,
            [ocp.delete(A_axes, axes[0]), G_axes[:A_ndim - 1]])  # noqa: E501

    else:
        A_axes = ocp.arange(A_ndim)
        B_axes = ocp.arange(B_ndim)
        summed_axes = [
            ocp.asarray(axes[0]) % A_ndim,
            ocp.asarray(axes[1]) % B_ndim,
        ]  # noqa: E501
        other_axes = [
            ocp.delete(A_axes, summed_axes[0]),
            ocp.delete(B_axes, summed_axes[1]),  # noqa: E501
        ]
        out = ocp.tensordot(A, G, [other_axes[0], G_axes[:len(other_axes[0])]])
        perm = ocp.argsort(
            ocp.concatenate(
                (summed_axes[1][ocp.argsort(summed_axes[0])], other_axes[1])))
        return ocp.transpose(out, perm)
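
The same shape check for the B side (again with NumPy standing in for ocp):

import numpy as ocp  # assumption: ocp is a NumPy-compatible module

A = ocp.random.rand(3, 4)
G = ocp.ones((3, 5))  # upstream gradient of ocp.tensordot(A, B, 1), B of shape (4, 5)
dB = tensordot_adjoint_1(A, G, 1, A.ndim, 2)
assert dB.shape == (4, 5)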
Example #11
def ordinal_patt_array(array,
                       array_mask=None,
                       dim=2,
                       step=1,
                       weights=False,
                       verbosity=0):
    """Returns symbolified array of ordinal patterns.

    Each data vector (X_t, ..., X_t+(dim-1)*step) is converted to its rank
    vector. E.g., (0.2, -.6, 1.2) --> (1,0,2) which is then assigned to a
    unique integer (see Article). There are factorial(dim) possible rank vectors.

    Note that the symb_array is step*(dim-1) shorter than the original array!

    Reference: B. Pompe and J. Runge (2011). Momentary information transfer as
    a coupling measure of time series. Phys. Rev. E, 83(5), 1-12.
    doi:10.1103/PhysRevE.83.051122

    Parameters
    ----------
    array : array-like
        Data array of shape (time, variables).

    array_mask : bool array
        Data mask where True labels masked samples.

    dim : int, optional (default: 2)
        Pattern dimension

    step : int, optional (default: 1)
        Delay of pattern embedding vector.

    weights : bool, optional (default: False)
        Whether to return array of variances of embedding vectors as weights.

    verbosity : int, optional (default: 0)
        Level of verbosity.

    Returns
    -------
    patt, patt_mask [, patt_time] : tuple of arrays
        Tuple of converted pattern array and new length
    """
    # scipy.misc.factorial was removed in SciPy 1.3; it lives in scipy.special.
    from scipy.special import factorial

    # Import cython code
    try:
        import tigramite.tigramite_cython_code as tigramite_cython_code
    except ImportError:
        raise ImportError("Could not import tigramite_cython_code, please"
                          " compile cython code first as described in Readme.")

    array = array.astype('float64')

    if array_mask is not None:
        assert array_mask.dtype == 'int32'
    else:
        array_mask = np.zeros(array.shape, dtype='int32')

    if np.ndim(array) == 1:
        T = len(array)
        array = array.reshape(T, 1)
        array_mask = array_mask.reshape(T, 1)

    # Add noise to destroy ties...
    array += (1E-6 * array.std(axis=0) *
              np.random.rand(array.shape[0], array.shape[1]).astype('float64'))

    patt_time = int(array.shape[0] - step * (dim - 1))
    T, N = array.shape

    if dim <= 1 or patt_time <= 0:
        raise ValueError("dim must be > 1 and the delay vector must be "
                         "shorter than the array length.")

    patt = np.zeros((patt_time, N), dtype='int32')
    weights_array = np.zeros((patt_time, N), dtype='float64')

    patt_mask = np.zeros((patt_time, N), dtype='int32')

    # Precompute factorial for c-code... patterns of dimension
    # larger than 10 are not supported
    fac = factorial(np.arange(10)).astype('int32')

    # _get_patterns_cython assumes mask=0 to be a masked value
    array_mask = (array_mask == False).astype('int32')

    (patt, patt_mask, weights_array) = \
            tigramite_cython_code._get_patterns_cython(array, array_mask,
                                                       patt, patt_mask,
                                                       weights_array, dim,
                                                       step, fac, N, T)

    weights_array = np.asarray(weights_array)
    patt = np.asarray(patt)
    # Transform back to mask=1 implying a masked value
    patt_mask = np.asarray(patt_mask) == False

    if weights:
        return (patt, patt_mask, patt_time, weights_array)
    else:
        return (patt, patt_mask, patt_time)
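
A minimal usage sketch with a hypothetical series (requires the compiled
tigramite_cython_code module):

import numpy as np

series = np.random.randn(1000, 1)
patt, patt_mask, patt_time = ordinal_patt_array(series, dim=2, step=1)
# The symbolified array is step*(dim-1) = 1 sample shorter than the input.
assert patt.shape == (patt_time, 1) and patt_time == 999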
Example #12
def smooth(data, smooth_width, kernel='gaussian', mask=None, residuals=False):
    """Returns either smoothed time series or its residuals.

    the difference between the original and the smoothed time series
    (=residuals) of a kernel smoothing with gaussian (smoothing kernel width =
    twice the sigma!) or heaviside window, equivalent to a running mean.

    Assumes data of shape (T, N) or (T,)
    :rtype: array
    :returns: smoothed/residual data

    Parameters
    ----------
    data : array
        Data array of shape (time, variables).

    smooth_width : float
        Window width of smoothing, 2*sigma for a gaussian.

    kernel : str, optional (default: 'gaussian')
        Smoothing kernel, 'gaussian' or 'heaviside' for a running mean.

    mask : bool array, optional (default: None)
        Data mask where True labels masked samples.

    residuals : bool, optional (default: False)
        True if residuals should be returned instead of smoothed data.

    Returns
    -------
    data : array-like
        Smoothed/residual data.
    """

    print("%s %s smoothing with " % ({
        True: "Take residuals of a ",
        False: ""
    }[residuals], kernel) + "window width %.2f (2*sigma for a gaussian!)" %
          (smooth_width))

    totaltime = len(data)
    if kernel == 'gaussian':
        window = np.exp(-(np.arange(totaltime).reshape(
            (1, totaltime)) - np.arange(totaltime).reshape(
                (totaltime, 1)))**2 / ((2. * smooth_width / 2.)**2))
    elif kernel == 'heaviside':
        import scipy.linalg
        wtmp = np.zeros(totaltime)
        # Slice indices must be integers; np.ceil returns a float.
        wtmp[:int(np.ceil(smooth_width / 2.))] = 1
        window = scipy.linalg.toeplitz(wtmp)

    if mask is None:
        if np.ndim(data) == 1:
            smoothed_data = (data * window).sum(axis=1) / window.sum(axis=1)
        else:
            smoothed_data = np.zeros(data.shape)
            for i in range(data.shape[1]):
                smoothed_data[:, i] = (data[:, i] *
                                       window).sum(axis=1) / window.sum(axis=1)
    else:
        if np.ndim(data) == 1:
            smoothed_data = ((data * window * (mask == False)).sum(axis=1) /
                             (window * (mask == False)).sum(axis=1))
        else:
            smoothed_data = np.zeros(data.shape)
            for i in range(data.shape[1]):
                smoothed_data[:, i] = ((data[:, i] * window *
                                        (mask == False)[:, i]).sum(axis=1) /
                                       (window *
                                        (mask == False)[:, i]).sum(axis=1))

    if residuals:
        return data - smoothed_data
    else:
        return smoothed_data
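
A minimal usage sketch with hypothetical data, returning the residuals of a
gaussian smoothing:

import numpy as np

data = np.random.randn(200, 2)
resid = smooth(data, smooth_width=10., kernel='gaussian', residuals=True)
assert resid.shape == data.shape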
Example #13
def gradient(f, *varargs, axis=None, edge_order=1):
    """Return the gradient of an N-dimensional array.

    The gradient is computed using second order accurate central differences
    in the interior points and either first or second order accurate one-sided
    (forward or backward) differences at the boundaries.
    The returned gradient hence has the same shape as the input array.

    Args:
        f (cupy.ndarray): An N-dimensional array containing samples of a scalar
            function.
        varargs (list of scalar or array, optional): Spacing between f values.
            Default unitary spacing for all dimensions. Spacing can be
            specified using:

            1. single scalar to specify a sample distance for all dimensions.
            2. N scalars to specify a constant sample distance for each
               dimension. i.e. `dx`, `dy`, `dz`, ...
            3. N arrays to specify the coordinates of the values along each
               dimension of F. The length of the array must match the size of
               the corresponding dimension
            4. Any combination of N scalars/arrays with the meaning of 2. and
               3.

            If `axis` is given, the number of varargs must equal the number of
            axes. Default: 1.
        edge_order ({1, 2}, optional): The gradient is calculated using N-th
            order accurate differences at the boundaries. Default: 1.
        axis (None or int or tuple of ints, optional): The gradient is
            calculated only along the given axis or axes. The default
            (axis = None) is to calculate the gradient for all the axes of the
            input array. axis may be negative, in which case it counts from the
            last to the first axis.

    Returns:
        gradient (cupy.ndarray or list of cupy.ndarray): A set of ndarrays
        (or a single ndarray if there is only one dimension) corresponding
        to the derivatives of f with respect to each dimension. Each
        derivative has the same shape as f.

    .. seealso:: :func:`numpy.gradient`
    """
    f = cupy.asanyarray(f)
    ndim = f.ndim  # number of dimensions
    axes = internal._normalize_axis_indices(axis, ndim, sort_axes=False)

    len_axes = len(axes)
    n = len(varargs)
    if n == 0:
        # no spacing argument - use 1 in all axes
        dx = [1.0] * len_axes
    elif n == 1 and cupy.ndim(varargs[0]) == 0:
        # single scalar for all axes
        dx = varargs * len_axes
    elif n == len_axes:
        # scalar or 1d array for each axis
        dx = list(varargs)
        for i, distances in enumerate(dx):
            if cupy.ndim(distances) == 0:
                continue
            elif cupy.ndim(distances) != 1:
                raise ValueError("distances must be either scalars or 1d")
            if len(distances) != f.shape[axes[i]]:
                raise ValueError("when 1d, distances must match "
                                 "the length of the corresponding dimension")
            if numpy.issubdtype(distances.dtype, numpy.integer):
                # Convert numpy integer types to float64 to avoid modular
                # arithmetic in np.diff(distances).
                distances = distances.astype(numpy.float64)
            diffx = cupy.diff(distances)
            # if distances are constant reduce to the scalar case
            # since it brings a consistent speedup
            if (diffx == diffx[0]).all():  # synchronize
                diffx = diffx[0]
            dx[i] = diffx
    else:
        raise TypeError("invalid number of arguments")

    if edge_order > 2:
        raise ValueError("'edge_order' greater than 2 not supported")

    # use central differences on interior and one-sided differences on the
    # endpoints. This preserves second order-accuracy over the full domain.

    outvals = []

    # create slice objects --- initially all are [:, :, ..., :]
    slice1 = [slice(None)] * ndim
    slice2 = [slice(None)] * ndim
    slice3 = [slice(None)] * ndim
    slice4 = [slice(None)] * ndim

    otype = f.dtype
    if numpy.issubdtype(otype, numpy.inexact):
        pass
    else:
        # All other types convert to floating point.
        # First check if f is a numpy integer type; if so, convert f to float64
        # to avoid modular arithmetic when computing the changes in f.
        if numpy.issubdtype(otype, numpy.integer):
            f = f.astype(numpy.float64)
        otype = numpy.float64

    for axis, ax_dx in zip(axes, dx):
        if f.shape[axis] < edge_order + 1:
            raise ValueError(
                "Shape of array too small to calculate a numerical gradient, "
                "at least (edge_order + 1) elements are required.")
        # result allocation
        out = cupy.empty_like(f, dtype=otype)

        # spacing for the current axis
        uniform_spacing = cupy.ndim(ax_dx) == 0

        # Numerical differentiation: 2nd order interior
        slice1[axis] = slice(1, -1)
        slice2[axis] = slice(None, -2)
        slice3[axis] = slice(1, -1)
        slice4[axis] = slice(2, None)

        if uniform_spacing:
            out[tuple(slice1)] = (f[tuple(slice4)] -
                                  f[tuple(slice2)]) / (2.0 * ax_dx)
        else:
            dx1 = ax_dx[0:-1]
            dx2 = ax_dx[1:]
            dx_sum = dx1 + dx2
            a = -(dx2) / (dx1 * dx_sum)
            b = (dx2 - dx1) / (dx1 * dx2)
            c = dx1 / (dx2 * dx_sum)
            # fix the shape for broadcasting
            shape = [1] * ndim
            shape[axis] = -1
            a.shape = b.shape = c.shape = tuple(shape)
            # 1D equivalent -- out[1:-1] = a * f[:-2] + b * f[1:-1] + c * f[2:]
            out[tuple(slice1)] = (a * f[tuple(slice2)] + b * f[tuple(slice3)] +
                                  c * f[tuple(slice4)])

        # Numerical differentiation: 1st order edges
        if edge_order == 1:
            slice1[axis] = 0
            slice2[axis] = 1
            slice3[axis] = 0
            dx_0 = ax_dx if uniform_spacing else ax_dx[0]
            # 1D equivalent -- out[0] = (f[1] - f[0]) / (x[1] - x[0])
            out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_0

            slice1[axis] = -1
            slice2[axis] = -1
            slice3[axis] = -2
            dx_n = ax_dx if uniform_spacing else ax_dx[-1]
            # 1D equivalent -- out[-1] = (f[-1] - f[-2]) / (x[-1] - x[-2])
            out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_n

        # Numerical differentiation: 2nd order edges
        else:
            slice1[axis] = 0
            slice2[axis] = 0
            slice3[axis] = 1
            slice4[axis] = 2
            if uniform_spacing:
                a = -1.5 / ax_dx
                b = 2.0 / ax_dx
                c = -0.5 / ax_dx
            else:
                dx1 = ax_dx[0]
                dx2 = ax_dx[1]
                dx_sum = dx1 + dx2
                a = -(2.0 * dx1 + dx2) / (dx1 * (dx_sum))
                b = dx_sum / (dx1 * dx2)
                c = -dx1 / (dx2 * (dx_sum))
            # 1D equivalent -- out[0] = a * f[0] + b * f[1] + c * f[2]
            out[tuple(slice1)] = (a * f[tuple(slice2)] + b * f[tuple(slice3)] +
                                  c * f[tuple(slice4)])

            slice1[axis] = -1
            slice2[axis] = -3
            slice3[axis] = -2
            slice4[axis] = -1
            if uniform_spacing:
                a = 0.5 / ax_dx
                b = -2.0 / ax_dx
                c = 1.5 / ax_dx
            else:
                dx1 = ax_dx[-2]
                dx2 = ax_dx[-1]
                dx_sum = dx1 + dx2
                a = (dx2) / (dx1 * (dx_sum))
                b = -dx_sum / (dx1 * dx2)
                c = (2.0 * dx2 + dx1) / (dx2 * (dx_sum))
            # 1D equivalent -- out[-1] = a * f[-3] + b * f[-2] + c * f[-1]
            out[tuple(slice1)] = (a * f[tuple(slice2)] + b * f[tuple(slice3)] +
                                  c * f[tuple(slice4)])
        outvals.append(out)

        # reset the slice object in this dimension to ":"
        slice1[axis] = slice(None)
        slice2[axis] = slice(None)
        slice3[axis] = slice(None)
        slice4[axis] = slice(None)

    if len_axes == 1:
        return outvals[0]
    else:
        return outvals
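
A minimal sketch of the NumPy-compatible call patterns this implements:

import cupy

f = cupy.asarray([1., 2., 4., 7., 11.])
g1 = cupy.gradient(f)      # unit spacing
g2 = cupy.gradient(f, 2.)  # uniform scalar spacing
x = cupy.asarray([0., 1., 1.5, 3.5, 4.])
g3 = cupy.gradient(f, x)   # non-uniform coordinates along the axis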
Example #14
File: histogram.py  Project: zhaohb/cupy
def histogramdd(sample, bins=10, range=None, weights=None, density=False):
    """Compute the multidimensional histogram of some data.

    Args:
        sample (cupy.ndarray): The data to be histogrammed. (N, D) or (D, N)
            array

            Note the unusual interpretation of sample when an array_like:

            * When an array, each row is a coordinate in a D-dimensional
              space - such as ``histogramdd(cupy.array([p1, p2, p3]))``.
            * When an array_like, each element is the list of values for single
              coordinate - such as ``histogramdd((X, Y, Z))``.

            The first form should be preferred.
        bins (int or tuple of int or cupy.ndarray): The bin specification:

            * A sequence of arrays describing the monotonically increasing bin
              edges along each dimension.
            * The number of bins for each dimension (nx, ny, ... =bins)
            * The number of bins for all dimensions (nx=ny=...=bins).
        range (sequence, optional): A sequence of length D, each an optional
            (lower, upper) tuple giving the outer bin edges to be used if the
            edges are not given explicitly in `bins`. An entry of None in the
            sequence results in the minimum and maximum values being used for
            the corresponding dimension. The default, None, is equivalent to
            passing a tuple of D None values.
        weights (cupy.ndarray): An array of values `w_i` weighing each sample
            `(x_i, y_i, z_i, ...)`. The values of the returned histogram are
            equal to the sum of the weights belonging to the samples falling
            into each bin.
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.

    Returns:
        H (cupy.ndarray): The multidimensional histogram of sample x. See
            normed and weights for the different possible semantics.
        edges (list of cupy.ndarray): A list of D arrays describing the bin
            edges for each dimension.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogramdd`
    """
    if isinstance(sample, cupy.ndarray):
        # Sample is an ND-array.
        if sample.ndim == 1:
            sample = sample[:, cupy.newaxis]
        nsamples, ndim = sample.shape
    else:
        sample = cupy.stack(sample, axis=-1)
        nsamples, ndim = sample.shape

    nbin = numpy.empty(ndim, int)
    edges = ndim * [None]
    dedges = ndim * [None]
    if weights is not None:
        weights = cupy.asarray(weights)

    try:
        nbins = len(bins)
        if nbins != ndim:
            raise ValueError(
                'The dimension of bins must be equal to the dimension of the '
                'sample x.')
    except TypeError:
        # bins is an integer
        bins = ndim * [bins]

    # normalize the range argument
    if range is None:
        range = (None, ) * ndim
    elif len(range) != ndim:
        raise ValueError('range argument must have one entry per dimension')

    # Create edge arrays
    for i in _range(ndim):
        if cupy.ndim(bins[i]) == 0:
            if bins[i] < 1:
                raise ValueError(
                    '`bins[{}]` must be positive, when an integer'.format(i))
            smin, smax = _get_outer_edges(sample[:, i], range[i])
            num = int(bins[i] + 1)  # synchronize!
            edges[i] = cupy.linspace(smin, smax, num)
        elif cupy.ndim(bins[i]) == 1:
            if not isinstance(bins[i], cupy.ndarray):
                raise ValueError('array-like bins not supported')
            edges[i] = bins[i]
            if (edges[i][:-1] > edges[i][1:]).any():  # synchronize!
                raise ValueError(
                    '`bins[{}]` must be monotonically increasing, when an '
                    'array'.format(i))
        else:
            raise ValueError(
                '`bins[{}]` must be a scalar or 1d array'.format(i))

        nbin[i] = len(edges[i]) + 1  # includes an outlier on each end
        dedges[i] = cupy.diff(edges[i])

    # Compute the bin number each sample falls into.
    ncount = tuple(
        # avoid cupy.digitize to work around NumPy issue gh-11022
        cupy.searchsorted(edges[i], sample[:, i], side='right')
        for i in _range(ndim))

    # Using digitize, values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right edge to be
    # counted in the last bin, and not as an outlier.
    for i in _range(ndim):
        # Find which points are on the rightmost edge.
        on_edge = sample[:, i] == edges[i][-1]
        # Shift these points one bin to the left.
        ncount[i][on_edge] -= 1

    # Compute the sample indices in the flattened histogram matrix.
    # This raises an error if the array is too large.
    xy = cupy.ravel_multi_index(ncount, nbin)

    # Compute the number of repetitions in xy and assign it to the
    # flattened histmat.
    hist = cupy.bincount(xy, weights, minlength=numpy.prod(nbin))

    # Shape into a proper matrix
    hist = hist.reshape(nbin)

    # This preserves the (bad) behavior observed in NumPy gh-7845, for now.
    hist = hist.astype(float)  # Note: NumPy uses casting='safe' here too

    # Remove outliers (indices 0 and -1 for each dimension).
    core = ndim * (slice(1, -1), )
    hist = hist[core]

    if density:
        # calculate the probability density function
        s = hist.sum()
        for i in _range(ndim):
            shape = [1] * ndim
            shape[i] = nbin[i] - 2
            hist = hist / dedges[i].reshape(shape)
        hist /= s

    if any(hist.shape != numpy.asarray(nbin) - 2):
        raise RuntimeError('Internal Shape Error')
    return hist, edges
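
A minimal usage sketch (assumes a CuPy version that exports histogramdd):

import cupy

sample = cupy.random.randn(1000, 3)  # 1000 points in 3-D
hist, edges = cupy.histogramdd(sample, bins=(5, 6, 7))
assert hist.shape == (5, 6, 7) and len(edges) == 3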