def test_copy_multi_device_with_stream(self):
    # The kernel deliberately takes long enough before writing, so a
    # racing copy can observe the pre-kernel contents of the buffer.
    kernel_src = _test_copy_multi_device_with_stream_src
    if runtime.is_hip and driver.get_build_version() >= 5_00_00000:
        kernel_src = '#include <ctime>\n' + kernel_src
    wait_and_write = cupy.RawKernel(kernel_src, 'wait_and_write')

    # Allocate the buffer and launch the slow kernel on device 0 with
    # its own stream.
    with cuda.Device(0):
        # This stream must stay alive over the D2D copy below for HIP.
        with cuda.Stream() as stream0:  # NOQA
            src_arr = cupy.zeros((2,), dtype=numpy.uint64)
            wait_and_write((1,), (1,), src_arr)

    # A D2D copy to another device on another stream should pick up the
    # original values of the memory before the kernel on the first
    # device finally performs its write.
    with cuda.Device(1):
        with cuda.Stream():
            dst_arr = src_arr.copy()
            testing.assert_array_equal(
                dst_arr, numpy.array([0, 0], dtype=numpy.uint64))
def __mul__(self, other):
    """Compute ``self * other`` for a CSR matrix.

    Dispatches on the type of ``other``: scalar, sparse matrix
    (CSR/CSC/other formats), or dense array (0-d/1-d/2-d), choosing
    among the available cuSPARSE/CUB backend routines.

    Raises:
        NotImplementedError: if no suitable backend routine is available.
        ValueError: if a dense operand has more than 2 dimensions.
    """
    if cupy.isscalar(other):
        # Scalar: scale the stored nonzeros after merging duplicates.
        self.sum_duplicates()
        return self._with_data(self.data * other)
    elif isspmatrix_csr(other):
        # CSR @ CSR: prefer csrgemm2, fall back to csrgemm.
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm2'):
            return cusparse.csrgemm2(self, other)
        elif cusparse.check_availability('csrgemm'):
            return cusparse.csrgemm(self, other)
        else:
            raise NotImplementedError
    elif csc.isspmatrix_csc(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm'):
            # csrgemm can consume the CSC operand directly as a
            # transposed CSR matrix (transb=True), avoiding a conversion.
            return cusparse.csrgemm(self, other.T, transb=True)
        elif cusparse.check_availability('csrgemm2'):
            # csrgemm2 needs both operands in CSR form.
            b = other.tocsr()
            b.sum_duplicates()
            return cusparse.csrgemm2(self, b)
        else:
            raise NotImplementedError
    elif base.isspmatrix(other):
        # Any other sparse format: convert to CSR and retry.
        return self * other.tocsr()
    elif base.isdense(other):
        if other.ndim == 0:
            # 0-d dense behaves like a scalar.
            self.sum_duplicates()
            return self._with_data(self.data * other)
        elif other.ndim == 1:
            # Sparse matrix-vector product.
            self.sum_duplicates()
            other = cupy.asfortranarray(other)
            # need extra padding to ensure not stepping on the CUB bug,
            # see cupy/cupy#3679 for discussion
            is_cub_safe = (self.indptr.data.mem.size >
                           self.indptr.size * self.indptr.dtype.itemsize)
            # CUB spmv is buggy since CUDA 11.0, see
            # https://github.com/cupy/cupy/issues/3822#issuecomment-782607637
            is_cub_safe &= (driver.get_build_version() < 11000)
            for accelerator in _accelerator.get_routine_accelerators():
                if (accelerator == _accelerator.ACCELERATOR_CUB
                        and is_cub_safe and other.flags.c_contiguous):
                    return cub.device_csrmv(
                        self.shape[0], self.shape[1], self.nnz,
                        self.data, self.indptr, self.indices, other)
            if (cusparse.check_availability('csrmvEx') and self.nnz > 0
                    and cusparse.csrmvExIsAligned(self, other)):
                # csrmvEx does not work if nnz == 0
                csrmv = cusparse.csrmvEx
            elif cusparse.check_availability('csrmv'):
                csrmv = cusparse.csrmv
            elif cusparse.check_availability('spmv'):
                csrmv = cusparse.spmv
            else:
                raise NotImplementedError
            return csrmv(self, other)
        elif other.ndim == 2:
            # Sparse matrix-dense matrix product.
            self.sum_duplicates()
            if cusparse.check_availability('csrmm2'):
                csrmm = cusparse.csrmm2
            elif cusparse.check_availability('spmm'):
                csrmm = cusparse.spmm
            else:
                raise NotImplementedError
            return csrmm(self, cupy.asfortranarray(other))
        else:
            raise ValueError('could not interpret dimensions')
    else:
        # Let Python try the reflected operation on ``other``.
        return NotImplemented
from cupy_backends.cuda.libs import nvrtc # NOQA from cupy_backends.cuda.libs import profiler # NOQA _available = None class _UnavailableModule(): available = False def __init__(self, name): self.__name__ = name from cupy.cuda import cub # NOQA if not runtime.is_hip and driver.get_build_version() > 0: from cupy.cuda import jitify # NOQA else: jitify = None try: from cupy_backends.cuda.libs import nvtx # NOQA except ImportError: nvtx = _UnavailableModule('cupy.cuda.nvtx') try: from cupy.cuda import thrust # NOQA except ImportError: thrust = _UnavailableModule('cupy.cuda.thrust')
import os
import re
import shutil
import subprocess
import sys
import tempfile

from cupy.cuda import device
from cupy.cuda import function
from cupy_backends.cuda.api import driver
from cupy_backends.cuda.api import runtime
from cupy_backends.cuda.libs import nvrtc

from cupy import _util

if not runtime.is_hip:
    _cuda_version = driver.get_build_version()
    if _cuda_version > 0:
        # jitify is only available on CUDA (not HIP) builds.
        from cupy.cuda.jitify import jitify

# Cached NVRTC version; populated lazily on first use.
_nvrtc_version = None
_win32 = sys.platform.startswith('win32')

# Compiler flags that request relocatable device code (separate
# compilation); used to detect when cudadevrt linking is needed.
_rdc_flags = ('--device-c', '-dc', '-rdc=true',
              '--relocatable-device-code=true')
# Path to the cudadevrt library; resolved lazily.
_cudadevrt = None


class NVCCException(Exception):
    """Raised when an nvcc invocation fails."""
    pass
def _compile_with_cache_hip(source, options, arch, cache_dir, extra_source,
                            backend='hiprtc', name_expressions=None,
                            log_stream=None, cache_in_memory=False,
                            use_converter=True):
    """Compile HIP source into a loaded module, using a disk cache.

    The compiled binary (.hsaco) is cached on disk keyed by a hash of
    the source, options, arch, compiler version and backend, unless
    ``cache_in_memory`` is True, in which case no disk I/O is done.

    Args:
        source (str): kernel source code.
        options (tuple of str): compiler options.
        arch: target architecture; defaults to the current device's
            compute capability when ``None``.
        cache_dir (str or None): cache directory; defaults to
            ``get_cache_dir()``.
        extra_source (str): additional source folded into the cache key.
        backend (str): ``'hiprtc'`` or ``'hipcc'``.
        name_expressions: C++ name expressions to resolve; forces
            recompilation to retrieve mangled names.
        log_stream: stream receiving compiler logs.
        cache_in_memory (bool): skip the disk cache entirely if True.
        use_converter (bool): run the CUDA-to-HIP source converter.

    Returns:
        function.Module: the loaded module.

    Raises:
        ValueError: if separate compilation (cudadevrt) is requested.
    """
    global _empty_file_preprocess_cache

    # TODO(leofang): this might be possible but is currently undocumented
    if _is_cudadevrt_needed(options):
        raise ValueError('separate compilation is not supported in HIP')

    # HIP's equivalent of -ftz=true, see ROCm-Developer-Tools/HIP#2252
    # Notes:
    # - For hipcc, this should just work, as invalid options would cause
    #   errors. See
    #   https://clang.llvm.org/docs/ClangCommandLineReference.html.
    # - For hiprtc, this is a no-op until the compiler options like -D and -I
    #   are accepted, see ROCm-Developer-Tools/HIP#2182 and
    #   ROCm-Developer-Tools/HIP#2248
    options += ('-fcuda-flush-denormals-to-zero',)

    # Workaround ROCm 4.3 LLVM_PATH issue in hipRTC #5689
    rocm_build_version = driver.get_build_version()
    if rocm_build_version >= 40300000 and rocm_build_version < 40500000:
        options += (
            '-I' + get_rocm_path() + '/llvm/lib/clang/13.0.0/include/',)

    if cache_dir is None:
        cache_dir = get_cache_dir()
    # As of ROCm 3.5.0 hiprtc/hipcc can automatically pick up the
    # right arch without setting HCC_AMDGPU_TARGET, so we don't need
    # to tell the compiler which arch we are targeting. But, we still
    # need to know arch as part of the cache key:
    if arch is None:
        # On HIP, gcnArch is computed from "compute capability":
        # https://github.com/ROCm-Developer-Tools/HIP/blob/rocm-4.0.0/rocclr/hip_device.cpp#L202
        arch = device.Device().compute_capability
    if use_converter:
        source = _convert_to_hip_source(source, extra_source,
                                        is_hiprtc=(backend == 'hiprtc'))

    env = (arch, options, _get_nvrtc_version(), backend)
    base = _empty_file_preprocess_cache.get(env, None)
    if base is None:
        # This is for checking HIPRTC/HIPCC compiler internal version:
        # preprocessing an empty file captures the compiler's identity
        # in the cache key so stale binaries are not reused across
        # compiler upgrades.
        if backend == 'hiprtc':
            base = _preprocess_hiprtc('', options)
        else:
            base = _preprocess_hipcc('', options)
        _empty_file_preprocess_cache[env] = base

    key_src = '%s %s %s %s' % (env, base, source, extra_source)
    key_src = key_src.encode('utf-8')
    name = _hash_hexdigest(key_src) + '.hsaco'

    mod = function.Module()

    if not cache_in_memory:
        # Read from disk cache
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir, exist_ok=True)

        # To handle conflicts in concurrent situation, we adopt lock-free
        # method to avoid performance degradation.
        # We force recompiling to retrieve C++ mangled names if so desired.
        path = os.path.join(cache_dir, name)
        if os.path.exists(path) and not name_expressions:
            with open(path, 'rb') as f:
                data = f.read()
            # The cache file layout is ``hash || binary``; a mismatched
            # hash means a corrupted (e.g. partially moved) file, which
            # is silently ignored and recompiled below.
            if len(data) >= _hash_length:
                hash_value = data[:_hash_length]
                binary = data[_hash_length:]
                binary_hash = _hash_hexdigest(binary).encode('ascii')
                if hash_value == binary_hash:
                    mod.load(binary)
                    return mod
    else:
        # Enforce compiling -- the resulting kernel will be cached elsewhere,
        # so we do nothing
        pass

    if backend == 'hiprtc':
        # compile_using_nvrtc calls hiprtc for hip builds
        binary, mapping = compile_using_nvrtc(
            source, options, arch, name + '.cu', name_expressions,
            log_stream, cache_in_memory)
        mod._set_mapping(mapping)
    else:
        binary = compile_using_hipcc(source, options, arch, log_stream)

    if not cache_in_memory:
        # Write to disk cache
        binary_hash = _hash_hexdigest(binary).encode('ascii')

        # shutil.move is not atomic operation, so it could result in a
        # corrupted file. We detect it by appending a hash at the beginning
        # of each cache file. If the file is corrupted, it will be ignored
        # next time it is read.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as tf:
            tf.write(binary_hash)
            tf.write(binary)
            temp_path = tf.name
        shutil.move(temp_path, path)

        # Save .cu source file along with .hsaco
        if _get_bool_env_variable('CUPY_CACHE_SAVE_CUDA_SOURCE', False):
            with open(path + '.cpp', 'w') as f:
                f.write(source)
    else:
        # we don't do any disk I/O
        pass

    mod.load(binary)
    return mod
def __mul__(self, other):
    """Compute ``self * other`` for a CSC matrix.

    Dispatches on the type of ``other``: scalar, sparse matrix, or
    dense array. Many cuSPARSE routines accept only CSR inputs, so CSC
    operands are handled either through transposed-CSR calls
    (``self.T`` with ``transa=True``) or by converting to CSR first.

    Raises:
        NotImplementedError: if no suitable backend routine is available.
        ValueError: if a dense operand has more than 2 dimensions.
    """
    if cupy.isscalar(other):
        # Scalar: scale the stored nonzeros after merging duplicates.
        self.sum_duplicates()
        return self._with_data(self.data * other)
    elif cupyx.scipy.sparse.isspmatrix_csr(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm') and not runtime.is_hip:
            # trans=True is still buggy as of ROCm 4.2.0
            a = self.T
            return cusparse.csrgemm(a, other, transa=True)
        elif cusparse.check_availability('csrgemm2'):
            # csrgemm2 requires CSR inputs; convert this CSC matrix.
            a = self.tocsr()
            a.sum_duplicates()
            return cusparse.csrgemm2(a, other)
        else:
            raise NotImplementedError
    elif isspmatrix_csc(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm') and not runtime.is_hip:
            # trans=True is still buggy as of ROCm 4.2.0
            a = self.T
            b = other.T
            return cusparse.csrgemm(a, b, transa=True, transb=True)
        elif cusparse.check_availability('csrgemm2'):
            # csrgemm2 requires CSR inputs; convert both operands.
            a = self.tocsr()
            b = other.tocsr()
            a.sum_duplicates()
            b.sum_duplicates()
            return cusparse.csrgemm2(a, b)
        else:
            raise NotImplementedError
    elif cupyx.scipy.sparse.isspmatrix(other):
        # Any other sparse format: convert to CSR and retry.
        return self * other.tocsr()
    elif _base.isdense(other):
        if other.ndim == 0:
            # 0-d dense behaves like a scalar.
            self.sum_duplicates()
            return self._with_data(self.data * other)
        elif other.ndim == 1:
            # Sparse matrix-vector product via transposed CSR.
            self.sum_duplicates()
            if (cusparse.check_availability('csrmv')
                    and (not runtime.is_hip
                         or driver.get_build_version() >= 5_00_00000)):
                # trans=True is buggy as of ROCm 4.2.0
                csrmv = cusparse.csrmv
            elif (cusparse.check_availability('spmv')
                    and not runtime.is_hip):
                # trans=True is buggy as of ROCm 4.2.0
                # (I got HIPSPARSE_STATUS_INTERNAL_ERROR...)
                csrmv = cusparse.spmv
            else:
                raise NotImplementedError
            return csrmv(self.T, cupy.asfortranarray(other), transa=True)
        elif other.ndim == 2:
            # Sparse matrix-dense matrix product via transposed CSR.
            self.sum_duplicates()
            if (cusparse.check_availability('csrmm2')
                    and (not runtime.is_hip
                         or driver.get_build_version() >= 5_00_00000)):
                # trans=True is buggy as of ROCm 4.2.0
                csrmm = cusparse.csrmm2
            elif cusparse.check_availability('spmm'):
                csrmm = cusparse.spmm
            else:
                raise NotImplementedError
            return csrmm(self.T, cupy.asfortranarray(other), transa=True)
        else:
            raise ValueError('could not interpret dimensions')
    else:
        # Let Python try the reflected operation on ``other``.
        return NotImplemented