def __init__(self, cfg):
    super(CUDABackend, self).__init__(cfg)

    # Get the desired CUDA device
    devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
    if not re.match(r'(round-robin|local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    # Handle the local-rank case
    if devid == 'local-rank':
        devid = str(get_local_rank())

    # In the non round-robin case set CUDA_DEVICE to be the desired
    # CUDA device number (used by pycuda.autoinit)
    os.environ.pop('CUDA_DEVICE', None)
    if devid != 'round-robin':
        os.environ['CUDA_DEVICE'] = devid

    # Create a CUDA context
    from pycuda.autoinit import context
    import pycuda.driver as cuda

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    context.set_cache_config(cuda.func_cache.PREFER_SHARED)

    from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                    types)

    # Register our data types
    self.base_matrix_cls = types.CUDAMatrixBase
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_rslice_cls = types.CUDAMatrixRSlice
    self.mpi_matrix_cls = types.CUDAMPIMatrix
    self.mpi_view_cls = types.CUDAMPIView
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView

    # Template lookup
    self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              cublas.CUDACUBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
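# A minimal sketch (not part of the backend itself) of how the device-id
# option above resolves to a concrete CUDA device.  The resolve_cuda_device
# helper is hypothetical and only illustrates the round-robin/local-rank/<N>
# logic before pycuda.autoinit picks up CUDA_DEVICE.
import os
import re


def resolve_cuda_device(devid, local_rank):
    # Return the value to place in CUDA_DEVICE, or None for round-robin
    # (in which case pycuda.autoinit chooses a device itself)
    if not re.match(r'(round-robin|local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    if devid == 'local-rank':
        return str(local_rank)
    elif devid == 'round-robin':
        return None
    else:
        return devid


# Example: with device-id = local-rank and two MPI ranks per node, rank 0
# binds to device 0 and rank 1 to device 1
for rank in (0, 1):
    dev = resolve_cuda_device('local-rank', rank)
    os.environ.pop('CUDA_DEVICE', None)
    if dev is not None:
        os.environ['CUDA_DEVICE'] = dev
    print(rank, os.environ.get('CUDA_DEVICE'))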
def __init__(self, cfg):
    super(CUDABackend, self).__init__(cfg)

    # Create a CUDA context
    from pycuda.autoinit import context as cuda_ctx

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    from pycuda.driver import func_cache
    cuda_ctx.set_cache_config(func_cache.PREFER_SHARED)

    # For introspection to work it must always be possible to
    # import the CUDABackend (even if CUDA is unavailable on the
    # system).  As many of our types/providers depend on the CUDA
    # runtime we import these here, locally, at the time of
    # instantiation.
    from pyfr.backends.cuda import (blasext, blockmats, cublas, packing,
                                    provider, types)

    # Register our data types
    self.block_diag_matrix_cls = types.CUDABlockDiagMatrix
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_rslice_cls = types.CUDAMatrixRSlice
    self.mpi_matrix_cls = types.CUDAMPIMatrix
    self.mpi_view_cls = types.CUDAMPIView
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView

    # Template lookup
    self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blockmats.BlockDiagMatrixKernels,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              cublas.CUDACublasKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
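# A hedged illustration of the deferred-import pattern described in the
# comment above: the backend module stays importable (e.g. so a frontend
# can enumerate available backends) even when PyCUDA/CUDA is absent,
# because the CUDA imports only run at instantiation time.  DummyBackend
# and list_backends below are hypothetical stand-ins, not PyFR code.
class DummyBackend:
    name = 'cuda'

    def __init__(self, cfg):
        # Imported lazily; an ImportError can only be raised here, never
        # at module-import time
        from pycuda.autoinit import context  # noqa: F401


def list_backends():
    # Introspection only needs the class object, never an instance, so
    # it works on CUDA-less systems
    return [cls.name for cls in [DummyBackend]]


print(list_backends())  # ['cuda'], even if pycuda is not installed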
def __init__(self, cfg):
    super(CUDABackend, self).__init__(cfg)

    # Create a CUDA context
    from pycuda.autoinit import context

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    from pycuda.driver import func_cache
    context.set_cache_config(func_cache.PREFER_SHARED)

    from pyfr.backends.cuda import (blasext, cublas, packing, provider,
                                    types)

    # Register our data types
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_rslice_cls = types.CUDAMatrixRSlice
    self.mpi_matrix_cls = types.CUDAMPIMatrix
    self.mpi_view_cls = types.CUDAMPIView
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView

    # Template lookup
    self.lookup = DottedTemplateLookup('pyfr.backends.cuda.kernels')

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              cublas.CUDACUBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
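# Hedged sketch of how an alignment requirement such as self.alignb = 128
# is typically used: the leading dimension (pitch) of a matrix is rounded
# up so that every row starts on a 128-byte boundary.  The aligned_pitch
# helper is hypothetical and not part of the backend above.
import numpy as np


def aligned_pitch(ncols, itemsize, alignb=128):
    rowsz = ncols*itemsize
    return ((rowsz + alignb - 1)//alignb)*alignb


# A 10-column double-precision matrix (80 bytes per row) would be padded
# to a 128-byte pitch, i.e. 16 columns of storage per row
print(aligned_pitch(10, np.dtype(np.float64).itemsize))  # 128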
def __init__(self, cfg):
    super().__init__(cfg)

    # Get the desired CUDA device
    devid = cfg.get('backend-cuda', 'device-id', 'local-rank')
    if not re.match(r'(round-robin|local-rank|\d+)$', devid):
        raise ValueError('Invalid device-id')

    # In the non round-robin case set CUDA_DEVICE to be the desired
    # CUDA device number (used by pycuda.autoinit)
    os.environ.pop('CUDA_DEVICE', None)

    # Handle the local-rank case
    if devid == 'local-rank':
        devord = str(get_local_rank())
        os.environ['CUDA_DEVICE'] = devord

    # Create a CUDA context
    from pycuda.autoinit import context
    import pycuda.driver as cuda

    # The commented-out approach below does not work with multiple
    # GPUs per node
    """
    if devid == 'local-rank':
        import pycuda.driver as cuda
        cuda.init()
        cudadevice = cuda.Device(int(devord))
        context = cudadevice.make_context()
        #import atexit
        #atexit.register(context.pop)
    elif devid == 'round-robin':
        from pycuda.autoinit import context
        import pycuda.driver as cuda
    """

    # Take the required alignment to be 128 bytes
    self.alignb = 128

    # Take the SoA size to be 32 elements
    self.soasz = 32

    # Get the MPI runtime type
    self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
    if self.mpitype not in {'standard', 'cuda-aware'}:
        raise ValueError('Invalid CUDA backend MPI type')

    # Some CUDA devices share L1 cache and shared memory; on these
    # devices CUDA allows us to specify a preference between L1
    # cache and shared memory.  For the sake of CUBLAS (which
    # benefits greatly from more shared memory but fails to
    # declare its preference) we set the global default to
    # PREFER_SHARED.
    context.set_cache_config(cuda.func_cache.PREFER_SHARED)
    #self.context = context

    from frfs.backends.cuda import (blasext, cublas, gimmik, packing,
                                    provider, types)

    # Register our data types
    self.base_matrix_cls = types.CUDAMatrixBase
    self.const_matrix_cls = types.CUDAConstMatrix
    self.matrix_cls = types.CUDAMatrix
    self.matrix_bank_cls = types.CUDAMatrixBank
    self.matrix_rslice_cls = types.CUDAMatrixRSlice
    self.queue_cls = types.CUDAQueue
    self.view_cls = types.CUDAView
    self.xchg_matrix_cls = types.CUDAXchgMatrix
    self.xchg_view_cls = types.CUDAXchgView

    # Instantiate the base kernel providers
    kprovs = [provider.CUDAPointwiseKernelProvider,
              blasext.CUDABlasExtKernels,
              packing.CUDAPackingKernels,
              gimmik.CUDAGiMMiKKernels,
              cublas.CUDACUBLASKernels]
    self._providers = [k(self) for k in kprovs]

    # Pointwise kernels
    self.pointwise = self._providers[0]
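# A hedged example of the [backend-cuda] configuration section consumed by
# the constructor above.  The ConfigParser usage here is only illustrative;
# the real cfg object wraps an INI-style file behind a
# get(section, option, default) interface.
import configparser

ini = """
[backend-cuda]
device-id = local-rank
mpi-type = cuda-aware
"""

cp = configparser.ConfigParser()
cp.read_string(ini)

devid = cp.get('backend-cuda', 'device-id', fallback='local-rank')
mpitype = cp.get('backend-cuda', 'mpi-type', fallback='standard')

if mpitype not in {'standard', 'cuda-aware'}:
    raise ValueError('Invalid CUDA backend MPI type')

print(devid, mpitype)  # local-rank cuda-aware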
                            pred_obs_mean_ar[tid],
                            &fltr_stt_mean_ar[offset_mean+wid],
                            dim_state_ar, tid, i, iteration);
        fltr_stt_cov_ar_cal(&pred_stt_cov_ar[offset_cov+wid], kalman_gain_ar,
                            &fltr_stt_cov_ar[offset_cov+wid], dim_state_ar,
                            temp, temp1, tid);
    }

    logpdf[tid] = log_pdf(pred_obs_mean_ar[tid], pred_obs_cov_ar[tid], tid);
}
""")

start = time.time()

context.set_cache_config(cuda.func_cache.PREFER_L1)

filter = mod.get_function("filter")

init_stt_mean_ar = mod.get_global('init_stt_mean_ar')[0]
init_stt_cov_ar = mod.get_global('init_stt_cov_ar')[0]
tran_mat_ar = mod.get_global('tran_mat_ar')[0]
observations_ar = mod.get_global('observations_ar')[0]

cuda.memcpy_htod(init_stt_mean_ar, init_stt_mean_ar_const)
cuda.memcpy_htod(init_stt_cov_ar, init_stt_cov_ar_const)
cuda.memcpy_htod(tran_mat_ar, tran_mat_ar_const)
cuda.memcpy_htod(observations_ar, observations_ar_const)

start = time.time()
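# A minimal, self-contained PyCUDA sketch of the same host-side pattern used
# above: compile a SourceModule, upload __constant__ data with
# get_global/memcpy_htod, fetch the kernel with get_function and launch it.
# The toy "scale" kernel is purely illustrative and unrelated to the Kalman
# filter kernel above.
import numpy as np
import pycuda.autoinit            # creates a context on import
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__constant__ float scale_const[1];

__global__ void scale(float *x, int n)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid < n)
        x[tid] *= scale_const[0];
}
""")

# Upload the constant, mirroring the memcpy_htod calls above
scale_sym = mod.get_global('scale_const')[0]
cuda.memcpy_htod(scale_sym, np.array([2.0], dtype=np.float32))

# Fetch and launch the kernel
scale = mod.get_function('scale')
x = np.arange(8, dtype=np.float32)
x_gpu = cuda.mem_alloc(x.nbytes)
cuda.memcpy_htod(x_gpu, x)
scale(x_gpu, np.int32(x.size), block=(32, 1, 1), grid=(1, 1))

# Copy the result back to the host
out = np.empty_like(x)
cuda.memcpy_dtoh(out, x_gpu)
print(out)  # each element doubled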