예제 #1
0
def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    """Combine two einsum operands into a single array.

    Both operands must already be "reduced": neither ``sub0`` nor ``sub1``
    may contain a repeated label, and every label that survives into the
    result must also appear in ``sub_others``.

    Labels shared by both operands are treated as batch dimensions when
    they also appear in ``sub_others`` and as contraction dimensions
    otherwise.

    Args:
        arr0: First operand.
        sub0: Subscript labels of ``arr0``.
        arr1: Second operand.
        sub1: Subscript labels of ``arr1``.
        sub_others: Labels that must be kept in the result.

    Returns:
        tuple: ``(array, subscripts)`` describing the combined operand.
    """
    labels0 = set(sub0)
    labels1 = set(sub1)
    assert len(labels0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(labels1) == len(sub1), 'operand 1 should be reduced: diagonal'

    # An operand without subscripts only needs a plain product.
    if not (len(sub0) and len(sub1)):
        return arr0 * arr1, sub0 + sub1

    others = set(sub_others)
    common = labels0 & labels1
    batch_dims = common & others
    contract_dims = common - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    sub_b = [sub0[i] for i in bs0]
    assert sub_b == [sub1[i] for i in bs1]
    sub_l = [sub0[i] for i in ts0]
    sub_r = [sub1[i] for i in ts1]

    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= others, 'operands should be reduced: unary sum'

    if not contract_dims:
        # Use element-wise multiply when no contraction is needed
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        lhs = _expand_dims_transpose(arr0, sub0, sub_out)
        rhs = _expand_dims_transpose(arr1, sub1, sub_out)
        return lhs * rhs, sub_out

    for accelerator in _accelerator.get_routine_accelerators():
        if accelerator != _accelerator.ACCELERATOR_CUTENSOR:
            continue
        if not _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1, batch_dims,
                             contract_dims):
            continue
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        out_shape = _get_out_shape(arr0.shape, sub0, arr1.shape, sub1,
                                   sub_out)
        result = cupy.empty(out_shape, arr0.dtype)
        arr0 = cupy.ascontiguousarray(arr0)
        arr1 = cupy.ascontiguousarray(arr1)
        desc_0 = cutensor.create_tensor_descriptor(arr0)
        desc_1 = cutensor.create_tensor_descriptor(arr1)
        desc_out = cutensor.create_tensor_descriptor(result)
        result = cutensor.contraction(1.0, arr0, desc_0, sub0, arr1,
                                      desc_1, sub1, 0.0, result,
                                      desc_out, sub_out)
        return result, sub_out

    # Fallback: flatten both operands and contract them with matmul.
    flat0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    flat1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    product = cupy.matmul(flat0, flat1)
    return product.reshape(shapes_out), sub_out
예제 #2
0
 def with_accelerators(self):
     """Temporarily select the routine accelerators for a test.

     Enables ``'cub'`` when ``self.enable_cub`` is truthy and disables all
     routine accelerators otherwise, yields once, and then restores the
     previously active list.

     NOTE(review): presumably wrapped as a context manager or a yield-style
     fixture by a decorator outside this view -- confirm.
     """
     old_accelerators = _accelerator.get_routine_accelerators()
     if self.enable_cub:
         _accelerator.set_routine_accelerators(['cub'])
     else:
         _accelerator.set_routine_accelerators([])
     try:
         yield
     finally:
         # Restore even when the managed body raises, so a failing test
         # does not leak its accelerator settings into later tests.
         _accelerator.set_routine_accelerators(old_accelerators)
예제 #3
0
 def setUp(self):
     """Save the accelerator settings and apply those for ``self.backend``.

     ``'device'`` enables ``'cub'`` for routines only and ``'block'``
     enables it for reductions only; any other backend leaves the current
     settings untouched.
     """
     self.old_routine_accelerators = _acc.get_routine_accelerators()
     self.old_reduction_accelerators = _acc.get_reduction_accelerators()

     backend = self.backend
     if backend == 'device':
         routine_accs, reduction_accs = ['cub'], []
     elif backend == 'block':
         routine_accs, reduction_accs = [], ['cub']
     else:
         return
     _acc.set_routine_accelerators(routine_accs)
     _acc.set_reduction_accelerators(reduction_accs)
예제 #4
0
    def setUp(self):
        """Prepare accelerator settings and the input array for a test.

        Clears the optimize-context cache, remembers the current reduction
        and routine accelerators, switches reductions to ``self.backend``,
        disables routine accelerators, and builds the ``self.x`` fixture.
        """
        cupy.core._optimize_config._clear_all_contexts_cache()

        acc = _accelerator
        previous_reductions = acc.get_reduction_accelerators()
        self.old_reductions = previous_reductions
        acc.set_reduction_accelerators(self.backend)

        # avoid shadowed by the cub module
        previous_routines = acc.get_routine_accelerators()
        self.old_routines = previous_routines
        acc.set_routine_accelerators([])

        fixture_shape = (3, 4)
        self.x = testing.shaped_arange(fixture_shape, cupy,
                                       dtype=cupy.float32)
예제 #5
0
파일: test_search.py 프로젝트: wphicks/cupy
 def setUp(self):
     """Unpack the test parameters and select accelerators per backend.

     ``'device'`` enables ``'cub'`` for routines only and is skipped when
     ``self.axis`` is not ``None``; ``'block'`` enables ``'cub'`` for
     reductions only. Any other backend leaves the settings untouched.

     Raises:
         unittest.SkipTest: For the ``'device'`` backend with an axis.
     """
     self.order, self.axis = self.order_and_axis
     self.old_routine_accelerators = _acc.get_routine_accelerators()
     self.old_reduction_accelerators = _acc.get_reduction_accelerators()

     backend = self.backend
     if backend == 'device':
         if self.axis is not None:
             raise unittest.SkipTest('does not support')
         routine_accs, reduction_accs = ['cub'], []
     elif backend == 'block':
         routine_accs, reduction_accs = [], ['cub']
     else:
         return
     _acc.set_routine_accelerators(routine_accs)
     _acc.set_reduction_accelerators(reduction_accs)
예제 #6
0
    def test_can_use_accelerator_set_unset(self):
        """Check which ``sum`` calls reach the CUB block reduction kernel.

        With routine accelerators disabled, ``a.sum()`` and ``a.sum(axis=1)``
        are expected to call the cached CUB reduction function, while
        ``a.sum(axis=0)`` is expected not to call it.
        """
        # ensure we use CUB block reduction and not CUB device reduction
        old_routine_accelerators = _accelerator.get_routine_accelerators()
        _accelerator.set_routine_accelerators([])
        try:
            a = cupy.random.random((10, 10))
            # this is the only function we can mock; the rest is cdef'd
            func = ''.join(('cupy.core._cub_reduction.',
                            '_SimpleCubReductionKernel_get_cached_function'))
            with testing.AssertFunctionIsCalled(func):
                a.sum()
            with testing.AssertFunctionIsCalled(func):
                a.sum(axis=1)
            with testing.AssertFunctionIsCalled(func, times_called=0):
                a.sum(axis=0)
        finally:
            # Restore the global setting even when a check above fails, so
            # the failure does not affect tests that run afterwards.
            _accelerator.set_routine_accelerators(old_routine_accelerators)
예제 #7
0
 def setUp(self):
     """Remember the routine accelerators, then enable only ``'cub'``."""
     previous = _accelerator.get_routine_accelerators()
     self.old_accelerators = previous
     _accelerator.set_routine_accelerators(['cub'])
예제 #8
0
 def test_max_nan(self, xp, dtype):
     """Return ``max()`` of an array that contains a NaN.

     The test is skipped when the cuTENSOR routine accelerator is enabled.
     """
     cutensor_id = _acc.ACCELERATOR_CUTENSOR
     if cutensor_id in _acc.get_routine_accelerators():
         pytest.skip()
     values = [float('nan'), 1, -1]
     arr = xp.array(values, dtype)
     return arr.max()
예제 #9
0
파일: csr.py 프로젝트: jakirkham/cupy
 def __mul__(self, other):
     """Compute ``self * other``, dispatching on the kind of ``other``.

     Scalars and 0-dim dense arrays scale the stored data. CSR, CSC and
     other sparse operands go through cuSPARSE ``csrgemm``/``csrgemm2``.
     1-dim dense operands use CUB (when enabled and safe) or a cuSPARSE
     matrix-vector routine, and 2-dim dense operands use a cuSPARSE
     matrix-matrix routine.

     Returns:
         The product, or ``NotImplemented`` for unsupported operand types.

     Raises:
         NotImplementedError: If no suitable cuSPARSE routine is available.
         ValueError: If a dense operand has more than two dimensions.
     """
     if cupy.isscalar(other):
         self.sum_duplicates()
         return self._with_data(self.data * other)

     if isspmatrix_csr(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm2'):
             return cusparse.csrgemm2(self, other)
         if cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other)
         raise NotImplementedError

     if csc.isspmatrix_csc(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other.T, transb=True)
         if cusparse.check_availability('csrgemm2'):
             other_csr = other.tocsr()
             other_csr.sum_duplicates()
             return cusparse.csrgemm2(self, other_csr)
         raise NotImplementedError

     if base.isspmatrix(other):
         return self * other.tocsr()

     if not base.isdense(other):
         return NotImplemented

     ndim = other.ndim
     if ndim == 0:
         self.sum_duplicates()
         return self._with_data(self.data * other)

     if ndim == 1:
         self.sum_duplicates()
         vec = cupy.asfortranarray(other)
         # need extra padding to ensure not stepping on the CUB bug,
         # see cupy/cupy#3679 for discussion
         is_cub_safe = (self.indptr.data.mem.size >
                        self.indptr.size * self.indptr.dtype.itemsize)
         for accelerator in _accelerator.get_routine_accelerators():
             if accelerator != _accelerator.ACCELERATOR_CUB:
                 continue
             if is_cub_safe and vec.flags.c_contiguous:
                 return cub.device_csrmv(self.shape[0], self.shape[1],
                                         self.nnz, self.data,
                                         self.indptr, self.indices,
                                         vec)
         # csrmvEx does not work if nnz == 0
         if (cusparse.check_availability('csrmvEx') and self.nnz > 0
                 and cusparse.csrmvExIsAligned(self, vec)):
             return cusparse.csrmvEx(self, vec)
         if cusparse.check_availability('csrmv'):
             return cusparse.csrmv(self, vec)
         if cusparse.check_availability('spmv'):
             return cusparse.spmv(self, vec)
         raise NotImplementedError

     if ndim == 2:
         self.sum_duplicates()
         if cusparse.check_availability('csrmm2'):
             return cusparse.csrmm2(self, cupy.asfortranarray(other))
         if cusparse.check_availability('spmm'):
             return cusparse.spmm(self, cupy.asfortranarray(other))
         raise NotImplementedError

     raise ValueError('could not interpret dimensions')
예제 #10
0
 def setUp(self):
     """Remember the routine accelerators and apply ``self.enable_cub``.

     ``'cub'`` is enabled when ``self.enable_cub`` is truthy; otherwise all
     routine accelerators are disabled.
     """
     self.old_accelerators = _accelerator.get_routine_accelerators()
     wanted = ['cub'] if self.enable_cub else []
     _accelerator.set_routine_accelerators(wanted)
예제 #11
0
 def setUp(self):
     """Remember, then disable, both routine and reduction accelerators."""
     previous_routines = _acc.get_routine_accelerators()
     self.old_accelerators = previous_routines
     _acc.set_routine_accelerators([])

     # also avoid fallback to CUB via the general reduction kernel
     previous_reductions = _acc.get_reduction_accelerators()
     self.old_reduction_accelerators = previous_reductions
     _acc.set_reduction_accelerators([])
예제 #12
0
 def test_argmin_nan(self, xp, dtype):
     """Return ``argmin()`` of an array that contains a NaN.

     The array is built with ``order=self.order``. The test is skipped when
     the cuTENSOR routine accelerator is enabled.
     """
     cutensor_id = _acc.ACCELERATOR_CUTENSOR
     if cutensor_id in _acc.get_routine_accelerators():
         pytest.skip()
     values = [float('nan'), 1, -1]
     arr = xp.array(values, dtype, order=self.order)
     return arr.argmin()
예제 #13
0
파일: test_order.py 프로젝트: zelo2/cupy
 def test_ptp_all_nan(self, xp, dtype):
     """Return ``xp.ptp`` of an array whose elements are all NaN.

     The test is skipped when the cuTENSOR routine accelerator is enabled.
     """
     cutensor_id = _acc.ACCELERATOR_CUTENSOR
     if cutensor_id in _acc.get_routine_accelerators():
         pytest.skip()
     values = [float('nan'), float('nan')]
     arr = xp.array(values, dtype)
     return xp.ptp(arr)
예제 #14
0
 def __mul__(self, other):
     """Compute ``self * other``, dispatching on the kind of ``other``.

     Scalars and 0-dim dense arrays scale the stored data. CSR, CSC and
     other sparse operands go through cuSPARSE ``csrgemm``/``csrgemm2``.
     1-dim dense operands use CUB or ``csrmvEx`` when the matrix is
     non-empty and aligned, and ``csrmv``/``spmv`` otherwise. 2-dim dense
     operands use a cuSPARSE matrix-matrix routine.

     Returns:
         The product, or ``NotImplemented`` for unsupported operand types.

     Raises:
         NotImplementedError: If no suitable cuSPARSE routine is available.
         ValueError: If a dense operand has more than two dimensions.
     """
     if cupy.isscalar(other):
         self.sum_duplicates()
         return self._with_data(self.data * other)

     if isspmatrix_csr(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm2'):
             return cusparse.csrgemm2(self, other)
         if cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other)
         raise NotImplementedError

     if csc.isspmatrix_csc(other):
         self.sum_duplicates()
         other.sum_duplicates()
         if cusparse.check_availability('csrgemm'):
             return cusparse.csrgemm(self, other.T, transb=True)
         if cusparse.check_availability('csrgemm2'):
             other_csr = other.tocsr()
             other_csr.sum_duplicates()
             return cusparse.csrgemm2(self, other_csr)
         raise NotImplementedError

     if base.isspmatrix(other):
         return self * other.tocsr()

     if not base.isdense(other):
         return NotImplemented

     ndim = other.ndim
     if ndim == 0:
         self.sum_duplicates()
         return self._with_data(self.data * other)

     if ndim == 1:
         self.sum_duplicates()
         vec = cupy.asfortranarray(other)
         # csrmvEx does not work if nnz == 0
         if not (self.nnz > 0 and cusparse.csrmvExIsAligned(self, vec)):
             if cusparse.check_availability('csrmv'):
                 return cusparse.csrmv(self, vec)
             if cusparse.check_availability('spmv'):
                 return cusparse.spmv(self, vec)
             raise NotImplementedError
         for accelerator in _accelerator.get_routine_accelerators():
             if accelerator != _accelerator.ACCELERATOR_CUB:
                 continue
             if vec.flags.c_contiguous:
                 return cub.device_csrmv(self.shape[0],
                                         self.shape[1], self.nnz,
                                         self.data, self.indptr,
                                         self.indices, vec)
         return cusparse.csrmvEx(self, vec)

     if ndim == 2:
         self.sum_duplicates()
         if cusparse.check_availability('csrmm2'):
             return cusparse.csrmm2(self, cupy.asfortranarray(other))
         if cusparse.check_availability('spmm'):
             return cusparse.spmm(self, cupy.asfortranarray(other))
         raise NotImplementedError

     raise ValueError('could not interpret dimensions')
예제 #15
0
파일: histogram.py 프로젝트: zhaohb/cupy
def histogram(x, bins=10, range=None, weights=None, density=False):
    """Computes the histogram of a set of data.

    Args:
        x (cupy.ndarray): Input array.
        bins (int or cupy.ndarray): If ``bins`` is an int, it represents the
            number of bins. If ``bins`` is an :class:`~cupy.ndarray`, it
            represents the bin edges.
        range (2-tuple of float, optional): The lower and upper range of the
            bins.  If not provided, range is simply ``(x.min(), x.max())``.
            Values outside the range are ignored. The first element of the
            range must be less than or equal to the second. `range` affects the
            automatic bin computation as well. While bin width is computed to
            be optimal based on the actual data within `range`, the bin count
            will fill the entire range including portions containing no data.
        weights (cupy.ndarray, optional): An array of weights, of the same
            shape as `x`.  Each value in `x` only contributes its associated
            weight towards the bin count (instead of 1).
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.

    Returns:
        tuple: ``(hist, bin_edges)`` where ``hist`` is a :class:`cupy.ndarray`
        storing the values of the histogram, and ``bin_edges`` is a
        :class:`cupy.ndarray` storing the bin edges.

    Raises:
        NotImplementedError: If ``x`` has a complex dtype, or if the dtype
            of ``weights`` cannot be cast to float or complex.
        ValueError: If ``x`` is not a :class:`cupy.ndarray`.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogram`
    """

    if x.dtype.kind == 'c':
        # TODO(unno): comparison between complex numbers is not implemented
        raise NotImplementedError('complex number is not supported')

    if not isinstance(x, cupy.ndarray):
        raise ValueError('x must be a cupy.ndarray')

    x, weights = _ravel_and_check_weights(x, weights)
    bin_edges = _get_bin_edges(x, bins, range)

    if weights is None:
        y = cupy.zeros(bin_edges.size - 1, dtype='l')
        for accelerator in _accelerator.get_routine_accelerators():
            # CUB uses int for bin counts
            # TODO(leofang): support >= 2^31 elements in x?
            if (accelerator == _accelerator.ACCELERATOR_CUB
                    and x.size <= 0x7fffffff and bin_edges.size <= 0x7fffffff):
                # Need to ensure the dtype of bin_edges as it's needed for both
                # the CUB call and the correction later
                assert isinstance(bin_edges, cupy.ndarray)
                if numpy.issubdtype(x.dtype, numpy.integer):
                    # ``numpy.float`` was only an alias of the builtin float
                    # and was removed in NumPy 1.24; float64 is the same
                    # dtype.
                    bin_type = numpy.float64
                else:
                    bin_type = numpy.result_type(bin_edges.dtype, x.dtype)
                    if (bin_type == numpy.float16
                            and not common._is_fp16_supported()):
                        bin_type = numpy.float32
                    x = x.astype(bin_type, copy=False)
                acc_bin_edge = bin_edges.astype(bin_type, copy=True)
                # CUB's upper bin boundary is exclusive for all bins, including
                # the last bin, so we must shift it to comply with NumPy
                if x.dtype.kind in 'ui':
                    acc_bin_edge[-1] += 1
                elif x.dtype.kind == 'f':
                    last = acc_bin_edge[-1]
                    acc_bin_edge[-1] = cupy.nextafter(last, last + 1)
                if runtime.is_hip:
                    y = y.astype(cupy.uint64, copy=False)
                y = cub.device_histogram(x, acc_bin_edge, y)
                if runtime.is_hip:
                    y = y.astype(cupy.int64, copy=False)
                break
        else:
            # No accelerator handled the request (the loop never hit
            # ``break``), so fall back to the generic kernel.
            _histogram_kernel(x, bin_edges, bin_edges.size, y)
    else:
        simple_weights = (cupy.can_cast(weights.dtype, cupy.float64)
                          or cupy.can_cast(weights.dtype, cupy.complex128))
        if not simple_weights:
            # object dtype such as Decimal are supported in NumPy, but not here
            raise NotImplementedError(
                'only weights with dtype that can be cast to float or complex '
                'are supported')
        if weights.dtype.kind == 'c':
            # Accumulate the real and imaginary parts separately.
            y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
            _weighted_histogram_kernel(x, bin_edges, bin_edges.size,
                                       weights.real, y.real)
            _weighted_histogram_kernel(x, bin_edges, bin_edges.size,
                                       weights.imag, y.imag)
        else:
            if weights.dtype.kind in 'bui':
                y = cupy.zeros(bin_edges.size - 1, dtype=int)
            else:
                y = cupy.zeros(bin_edges.size - 1, dtype=cupy.float64)
            _weighted_histogram_kernel(x, bin_edges, bin_edges.size, weights,
                                       y)

    if density:
        db = cupy.array(cupy.diff(bin_edges), cupy.float64)
        return y / db / y.sum(), bin_edges
    return y, bin_edges