def init_reikna(self): if REIK: if CUDA: self.api = cluda.cuda_api() else: self.api = cluda.ocl_api() self.dev = self.api.get_platforms()[0].get_devices()[0]
def _initialize(self): if self._pid != os.getpid(): print 'Initializing CUDA thread for pid', os.getpid() self._cluda_api = cuda_api() self._thr = self._cluda_api.Thread.create() self._fftc = {} self._pid = os.getpid()
def __initialize_gpu(self): try: import reikna.cluda as cluda from reikna.fft import FFT dtype = numpy.complex64 data = numpy.zeros( self.st['Kd'],dtype=dtype) data2 = numpy.empty_like(data) api = cluda.cuda_api() self.thr = api.Thread.create() self.data_dev = self.thr.to_device(data) self.data_rec = self.thr.to_device(data2) axes=range(0,numpy.size(self.st['Kd'])) myfftf= FFT( data, axes=axes) self.myfft = myfftf.compile(self.thr) self.cuda_flag=1 print('create gpu fft?',self.cuda_flag) print('line 642') W= self.st['q'][...,0] print('line 645') self.W = numpy.reshape(W, self.st['Kd'],order='C') print('line 647') # self.thr2 = api.Thread.create() print('line 649') self.W_dev = self.thr.to_device(self.W.astype(dtype)) print('line 652') except: self.cuda_flag=0 print('get error, using cpu')
def __init__(self, *args, **kwargs): self.api = cluda.cuda_api() super(CUDAContext, self).__init__(*args, **kwargs) # work-around: # thread synchronization does not involve CUBLAS. # cublas gets synchronized as soon as there is a memory transfer # thus we need a tiny array which is there just for synchronization self._sync_array = self.thread.array((1, ), dtype=numpy.float32)
def __init__(self, *args, **kwargs): self.api = cluda.cuda_api() super(CUDAContext, self).__init__(*args, **kwargs) # work-around: # thread synchronization does not involve CUBLAS. # cublas gets synchronized as soon as there is a memory transfer # thus we need a tiny array which is there just for synchronization self._sync_array = self.thread.array((1,), dtype=numpy.float32)
def modified_gemm_gpu(A, B, C): shape = (A.shape[0], B.shape[1]) api = cluda.cuda_api() thr = api.Thread.create() res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype) mul = MatrixMul(A, B, out_arr=res_arr) mulc = mul.compile(thr) mulc(res_arr, A, B) return res_arr + C
def test_fft(): api = cluda.cuda_api() thr = api.Thread.create() N = 256 M = 10000 #data_in = np.random.rand(N, N) + 1j*np.random.rand(N, N) data_in = np.random.rand(N, N).astype('complex') cl_data_in = thr.to_device(data_in) cl_data_out = thr.empty_like(cl_data_in) fft = FFT(thr).prepare_for(cl_data_out, cl_data_in, -1, axes=(0, ))
def __init__(self, diffs, coords, mask, probe, sample, sample_support, pmod_int = False): """Initialise the Ptychography module with the data in 'inputDir' Naming convention: coords_100x2.raw list of y, x coordinates in np.float64 pixel units diffs_322x256x512.raw 322 (256,512) diffraction patterns in np.float64 The zero pixel must be at [0, 0] and there must be an equal no. of postive and negative frequencies mask_256x512.raw (optional) mask for the diffraction data np.float64 probeInit_256x512 (optional) Initial estimate for the probe np.complex128 sampleInit_1024x2048 (optional) initial estimate for the sample np.complex128 also sets the field of view If not present then initialise with random numbers """ # # Get the shape shape = diffs[0].shape # # Store these values self.exits = makeExits(sample, probe, coords) # # This will save time later self.diffAmps = bg.quadshift(np.sqrt(diffs)) self.shape = shape self.shape_sample = sample.shape self.coords = coords self.mask = bg.quadshift(mask) self.probe = probe self.sample = sample self.alpha_div = 1.0e-10 self.error_mod = [] self.error_sup = [] self.error_conv = [] self.probe_sum = None self.sample_sum = None self.diffNorm = np.sum(self.mask * (self.diffAmps)**2) self.pmod_int = pmod_int self.sample_support = sample_support # # create a gpu thread api = cluda.cuda_api() self.thr = api.Thread.create() # # send the diffraction amplitudes, the exit waves and the mask to the gpu self.diffAmps_gpu = self.thr.to_device(self.diffAmps) * np.sqrt(float(self.diffAmps.shape[1]) * float(self.diffAmps.shape[2])) self.exits_gpu = self.thr.to_device(self.exits) mask2 = np.zeros_like(diffs, dtype=np.complex128) mask2[:] = self.mask.astype(np.complex128) self.mask_gpu = self.thr.to_device(mask2) # # compile the fft routine fft = FFT(self.diffAmps_gpu.astype(np.complex128), axes=(1,2)) self.fftc = fft.compile(self.thr, fast_math=True)
def initialize_gpu(self): try: import reikna.cluda as cluda from reikna.fft import FFT data = numpy.zeros( self.st['Kd'],dtype=dtype) print('get_platform') api = cluda.ocl_api() print('api=',api== cluda.cuda_api()) self.gpu_api = 'opencl' self.thr = api.Thread.create(async=True) print('line 630') self.data_dev = self.thr.to_device(data) axes=range(0,numpy.size(self.st['Kd'])) print('line 635') myfft= FFT( data, axes=axes) print('line 640') self.myfft = myfft.compile(self.thr,fast_math=True) print('line 640') self.gpu_flag=1 print('create gpu fft?',self.gpu_flag) print('line 642')# self.data_rec = self.thr.to_device(data2) W= self.st['w'][...,0] print('line 645') self.W = numpy.reshape(W, self.st['Kd'],order='C') print('line 647') # self.thr2 = api.Thread.create() print('line 649') self.W_dev = self.thr.to_device(self.W.astype(dtype)) self.W2_dev = self.thr.to_device(self.W.astype(dtype)) self.tmp_dev = self.thr.to_device(self.W.astype(dtype)) # device memory # self.tmp2_dev = self.thr.to_device(1.0/self.W.astype(dtype)) # device memory self.gpu_flag=1 # if self.debug > 0: print('line 652') except: self.gpu_flag=0 # if self.debug > 0: print('get error, using cpu')
def process(hdr_fiename, filename): api = cluda.cuda_api() thr = api.Thread.create() X = thr.array((10, 32768 * 2), dtype=numpy.complex128) iq_data = TCAPData(filename, hdr_fiename) file_counter = int(iq_data.filename_wo_ext[-3:]) fs = 312500 file_length_in_sec = 15625 * 32768 / fs time_passed_upto_now = (file_counter - 1) * file_length_in_sec # extract hour min sec hr, placeholder = divmod(time_passed_upto_now, 3600) mnt, sec = divmod(placeholder, 60) total_time = '{}h-{}m-{}s'.format(int(hr), int(mnt), int(sec)) title = 'Time: {}:{}:{}'.format(int(hr), int(mnt), int(sec)) zz = np.array([]) for j in range(1, 780 * 2 * 10 + 1, 2 * 10): data = np.array([]) # read 2*10 i.e. 20 blocks for i in range(j, j + 2 * 10): data = np.append(data, iq_data.read_block(i)) data = np.reshape(data, (10, 32768 * 2)) x = thr.to_device(data) fft = FFT(x, axes=(1, )) fftc = fft.compile(thr) fftc(X, x, 0) data_fft = X #data_fft = np.fft.fft(data, axis=1) data_fft = np.average(data_fft, axis=0) data_fft = np.abs(np.fft.fftshift(data_fft)) zz = np.append(zz, data_fft) zz = np.reshape(zz, (780, 32768 * 2)) data_fft_freqs = np.fft.fftshift(np.fft.fftfreq(32768 * 2, d=1 / fs)) # in Hz xx, yy = np.meshgrid(data_fft_freqs, np.arange(780)) yy = yy * 2.10 # in seconds plt_filename = '{}_{}'.format(iq_data.filename_wo_ext, total_time) print('Printing into file: ' + plt_filename) plot_spectrogram(xx, yy, zz, dbm=False, cmap=cm.jet, filename=plt_filename, dpi=500, title=title)
def initialize_gpu(backend, **kwargs): ''' Initialize a new GPU context. :param backend: backend to use. It must be any of "cuda" or "opencl". :type backend: str :param kwargs: configuration for the device lookup (see below for details). :type kwargs: dict * *interactive*: (bool) whether to select the device manually (defaults to False). * *device*: (int) number of the device to use (defaults to None). .. note:: The device can be selected using the MINKIT_DEVICE environment variable. ''' from reikna import cluda if backend == CUDA: api = cluda.cuda_api() elif backend == OPENCL: api = cluda.ocl_api() else: raise ValueError(f'Unknown backend type "{backend}"') # Get all available devices platforms = api.get_platforms() all_devices = [(p, d) for p in platforms for d in p.get_devices()] # Determine the device to use idev = device_lookup(all_devices, **kwargs) platform, device = all_devices[idev] logger.info( f'Selected device "{device.name}" ({idev}) (platform: {platform.name})' ) return Context(api, device, backend)
import pycuda.driver as cuda import pycuda.autoinit as autoinit from pycuda.compiler import SourceModule from pycuda.elementwise import ElementwiseKernel import reikna.cluda as cluda from reikna.fft import FFT ctx = autoinit.context if __name__ == '__main__': api = cluda.cuda_api() thr = api.Thread.create() size = (256,256,256) units = (.1,)*3 lam = .5 u0 = None n0 = 1. dn = np.zeros(size[::-1],np.complex64) clock = StopWatch() clock.tic("setup") Nx, Ny, Nz = size
def __init__(self, API = None, platform_number=None, device_number=None): """ Constructor. :param API: The API for the heterogeneous system. API='cuda' or API='ocl' :param platform_number: The number of the platform found by the API. :param device_number: The number of the device found on the platform. :type API: string :type platform_number: integer :type device_number: integer :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> NufftObj = pynufft.NUFFT_hsa(API='cuda', 0, 0) """ # pass self.dtype = numpy.complex64 # NUFFT_cpu.__init__(self) import reikna.cluda as cluda print('API = ', API) self.cuda_flag, self.ocl_flag = helper.diagnose() if None is API: if self.cuda_flag is 1: API = 'cuda' elif self.ocl_flag is 1: API = 'ocl' else: print('No accelerator is available.') else: api = API print('now using API = ', API) if platform_number is None: platform_number = 0 if device_number is None: device_number = 0 from reikna import cluda import reikna.transformations from reikna.cluda import functions, dtypes try: # try to create api/platform/device using the given parameters if 'cuda' == API: api = cluda.cuda_api() elif 'ocl' == API: api = cluda.ocl_api() platform = api.get_platforms()[platform_number] device = platform.get_devices()[device_number] except: # if failed, find out what's going wrong? print('No accelerator is detected.') # return 1 # Create context from device self.thr = api.Thread(device) #pyopencl.create_some_context() print('Using opencl or cuda = ', self.thr.api) # print('Using opencl? ', self.thr.api is reikna.cluda.ocl) # """ # Wavefront: as warp in cuda. Can control the width in a workgroup # Wavefront is required in spmv_vector as it improves data coalescence. # see cCSR_spmv and zSparseMatVec # """ self.wavefront = api.DeviceParameters(device).warp_size print('wavefront of OpenCL (as warp in CUDA) = ',self.wavefront) from ..src.re_subroutine import create_kernel_sets kernel_sets = create_kernel_sets(API) prg = self.thr.compile(kernel_sets, render_kwds=dict(LL = str(self.wavefront)), fast_math=False) self.prg = prg print("Note: In the future the api will change!") print("You have been warned!")
import time import numpy as np from reikna import cluda from reikna.fft import fft from numpy.linalg import norm #reikna fft only works with complex numbers dtype = np.complex64 #getting the cuda api api = cluda.cuda_api() thr = api.Thread.create() shape = (512, 2, 544, 3) #sending the array from host to array and creating result array on device a = np.random.randn(*shape).astype(dtype) a_dev = thr.to_device(a) res_dev = thr.array(shape, dtype=dtype) #reikna version of fft (compilation sends it to the thread) lastAxis = len(shape) - 1 t0 = time.time() fft = fft.FFT(a, axes=(lastAxis, )) fftc = fft.compile(thr) fftc(res_dev, a_dev) t1 = time.time() print('Time 1: ', t1 - t0) #tried to run compiled fft function with different input (it worked!) b = np.random.randn(*shape).astype(dtype)
device for device in Device.__dict__.keys() if device[0] != "_" ]: setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device])) setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device])) # this registers all the operations def _register_ops(namespace, device=Device.CPU): for name, cls in inspect.getmembers(namespace, inspect.isclass): if name[0] != "_": register(name.lower(), cls, device=device) from tinygrad import ops_cpu _register_ops(ops_cpu) try: import reikna.cluda as cluda from tinygrad import ops_gpu _register_ops(ops_gpu, device=Device.GPU) api = cluda.cuda_api() if os.environ.get( "GPAPI", "opencl") == "cuda" else cluda.ocl_api() thr = api.Thread.create() GPU = True except ImportError: # no GPU support GPU = False ANE = False
def __init__(self,inputSize, axes=(-1,),mode="pyfftw",dtype="complex64", direction="FORWARD",fftw_FLAGS=("FFTW_MEASURE","FFTW_DESTROY_INPUT"), THREADS=None, loggingLevel=None): self.axes = axes self.direction=direction if loggingLevel: logger.setLoggingLevel(loggingLevel) if mode=="gpu" or mode=="gpu_ocl" or mode=="gpu_cuda": if mode == "gpu": mode = "gpu_ocl" if REIKNA_AVAILABLE: if mode=="gpu_ocl": try: reikna_api = cluda.ocl_api() self.reikna_thread = reikna_api.Thread.create() self.FFTMODE="gpu" except: logger.warning("no reikna opencl available. \ will try cuda") mode = "gpu_cuda" if mode=="gpu_cuda": try: reikna_api = cluda.cuda_api() self.reikna_thread = reikna_api.Thread.create() self.FFTMODE="gpu" except: logger.warning("no cuda available. \ Switching to pyfftw") mode = "pyfftw" else: logger.warning("No gpu algorithms available\ switching to pyfftw") mode = "pyfftw" if mode=="pyfftw": if PYFFTW_AVAILABLE: self.FFTMODE = "pyfftw" else: logger.warning("No pyfftw available. \ Defaulting to scipy.fftpack") mode = "scipy" if mode=="scipy": if SCIPY_AVAILABLE: self.FFTMODE = "scipy" else: logger.warning("No scipy available - fft won't function.") if self.FFTMODE=="gpu": if direction=="FORWARD": self.inverse=1 elif direction=="BACKWARD": self.inverse=0 self.inputData = numpy.zeros( inputSize, dtype=dtype) inputData_dev = self.reikna_thread.to_device(self.inputData) self.outputData_dev = self.reikna_thread.array(inputSize, dtype=dtype) logger.info("Generating and compiling reikna gpu fft plan...") reikna_ft = reikna.fft.FFT(inputData_dev, axes=axes) self.reikna_ft_c = reikna_ft.compile(self.reikna_thread) logger.info("Done!") if self.FFTMODE=="pyfftw": if THREADS==None: THREADS=cpu_count() #fftw_FLAGS Set the optimisation level of fftw3, #(more optimisation takes longer - but gives quicker ffts.) #Can be FFTW_ESTIMATE, FFTW_MEASURE, FFT_PATIENT, FFTW_EXHAUSTIVE n = pyfftw.simd_alignment self.inputData = pyfftw.n_byte_align_empty( inputSize,n, dtype) self.inputData[:] = numpy.zeros( inputSize, dtype=dtype) self.outputData = pyfftw.n_byte_align_empty(inputSize,n, dtype) self.outputData[:] = numpy.zeros( inputSize,dtype=dtype) logger.info("Generating fftw3 plan....\nIf this takes too long, change fftw_FLAGS.") logger.debug("currently set to: {})".format(fftw_FLAGS)) if direction=="FORWARD": self.fftwPlan = pyfftw.FFTW(self.inputData,self.outputData, axes=axes, threads=THREADS,flags=fftw_FLAGS) elif direction=="BACKWARD": self.fftwPlan = pyfftw.FFTW(self.inputData,self.outputData, direction='FFTW_BACKWARD', axes=axes, threads=THREADS,flags=fftw_FLAGS) logger.info("Done!") elif self.FFTMODE=="scipy": self.direction=direction self.inputData = numpy.zeros(inputSize,dtype=dtype) self.size=[] for i in range(len(self.axes)): self.size.append(inputSize[self.axes[i]])
def offload(self, API, platform_number=0, device_number=0): """ self.offload(): Off-load NUFFT to the opencl or cuda device(s) :param API: define the device type, which can be 'cuda' or 'ocl' :param platform_number: define which platform to be used. The default platform_number = 0. :param device_number: define which device to be used. The default device_number = 0. :type API: string :type platform_number: int :type device_number: int :return: self: instance """ from reikna import cluda import reikna.transformations from reikna.cluda import functions, dtypes try: # try to create api/platform/device using the given parameters if 'cuda' == API: api = cluda.cuda_api() elif 'ocl' == API: api = cluda.ocl_api() platform = api.get_platforms()[platform_number] device = platform.get_devices()[device_number] except: # if failed, find out what's going wrong? diagnose() return 1 # print('device = ', device) # Create context from device self.thr = api.Thread(device) #pyopencl.create_some_context() # self.queue = pyopencl.CommandQueue( self.ctx) # """ # Wavefront: as warp in cuda. Can control the width in a workgroup # Wavefront is required in spmv_vector as it improves data coalescence. # see cSparseMatVec and zSparseMatVec # """ self.wavefront = api.DeviceParameters(device).warp_size print(api.DeviceParameters(device).max_work_group_size) # print(self.wavefront) # print(type(self.wavefront)) # pyopencl.characterize.get_simd_group_size(device[0], dtype.size) from src.re_subroutine import cMultiplyScalar, cCopy, cAddScalar,cAddVec, cSparseMatVec, cSelect, cMultiplyVec, cMultiplyVecInplace, cMultiplyConjVec, cDiff, cSqrt, cAnisoShrink # import complex float routines # print(dtypes.ctype(dtype)) prg = self.thr.compile( cMultiplyScalar.R + #cCopy.R, cCopy.R + cAddScalar.R + cSelect.R +cMultiplyConjVec.R + cAddVec.R+ cMultiplyVecInplace.R +cSparseMatVec.R+cDiff.R+ cSqrt.R+ cAnisoShrink.R+ cMultiplyVec.R, render_kwds=dict( LL = str(self.wavefront)), fast_math=False) # fast_math = False) # "#define LL "+ str(self.wavefront) + " "+cSparseMatVec.R) # ), # fast_math=False) # prg2 = pyopencl.Program(self.ctx, "#define LL "+ str(self.wavefront) + " "+cSparseMatVec.R).build() self.cMultiplyScalar = prg.cMultiplyScalar # self.cMultiplyScalar.set_scalar_arg_dtypes( cMultiplyScalar.scalar_arg_dtypes) self.cCopy = prg.cCopy self.cAddScalar = prg.cAddScalar self.cAddVec = prg.cAddVec self.cSparseMatVec = prg.cSparseMatVec self.cSelect = prg.cSelect self.cMultiplyVecInplace = prg.cMultiplyVecInplace self.cMultiplyVec = prg.cMultiplyVec self.cMultiplyConjVec = prg.cMultiplyConjVec self.cDiff = prg.cDiff self.cSqrt= prg.cSqrt self.cAnisoShrink = prg.cAnisoShrink # self.xx_Kd = pyopencl.array.zeros(self.queue, self.st['Kd'], dtype=dtype, order="C") self.k_Kd = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C")) self.k_Kd2 = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C")) self.y =self.thr.to_device( numpy.zeros((self.st['M'],), dtype=dtype, order="C")) self.x_Nd = self.thr.to_device(numpy.zeros(self.st['Nd'], dtype=dtype, order="C")) # self.xx_Nd = pyopencl.array.zeros(self.queue, self.st['Nd'], dtype=dtype, order="C") self.NdCPUorder, self.KdCPUorder, self.nelem = preindex_copy(self.st['Nd'], self.st['Kd']) self.NdGPUorder = self.thr.to_device( self.NdCPUorder) self.KdGPUorder = self.thr.to_device( self.KdCPUorder) self.Ndprod = numpy.int32(numpy.prod(self.st['Nd'])) self.Kdprod = numpy.int32(numpy.prod(self.st['Kd'])) self.M = numpy.int32( self.st['M']) self.SnGPUArray = self.thr.to_device( self.sn) self.sp_data = self.thr.to_device( self.sp.data.astype(dtype)) self.sp_indices =self.thr.to_device( self.sp.indices.astype(numpy.int32)) self.sp_indptr = self.thr.to_device( self.sp.indptr.astype(numpy.int32)) self.sp_numrow = self.M del self.sp self.spH_data = self.thr.to_device( self.spH.data.astype(dtype)) self.spH_indices = self.thr.to_device( self.spH.indices) self.spH_indptr = self.thr.to_device( self.spH.indptr) self.spH_numrow = self.Kdprod del self.spH self.spHsp_data = self.thr.to_device( self.spHsp.data.astype(dtype)) self.spHsp_indices = self.thr.to_device( self.spHsp.indices) self.spHsp_indptr =self.thr.to_device( self.spHsp.indptr) self.spHsp_numrow = self.Kdprod del self.spHsp # import reikna.cluda import reikna.fft # api = # self.thr = reikna.cluda.ocl_api().Thread(self.queue) self.fft = reikna.fft.FFT(self.k_Kd, numpy.arange(0, self.ndims)).compile(self.thr, fast_math=False) # self.fft = reikna.fft.FFT(self.k_Kd).compile(thr, fast_math=True) # self.fft = FFT(self.ctx, self.queue, self.k_Kd, fast_math=True) # self.ifft = FFT(self.ctx, self.queue, self.k_Kd2, fast_math=True) self.zero_scalar=dtype(0.0+0.0j)
def __init__(self, API=None, platform_number=None, device_number=None, verbosity=0): """ Constructor. :param API: The API for the heterogeneous system. API='cuda' or API='ocl' :param platform_number: The number of the platform found by the API. :param device_number: The number of the device found on the platform. :param verbosity: Defines the verbosity level, default value is 0 :type API: string :type platform_number: integer :type device_number: integer :type verbosity: integer :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> NufftObj = pynufft.NUFFT_hsa(API='cuda', platform_number=0, device_number=0, verbosity=0) """ warnings.warn( 'In the future NUFFT_hsa and NUFFT_cpu api will' ' be merged', FutureWarning) self.dtype = numpy.complex64 self.verbosity = verbosity import reikna.cluda as cluda if self.verbosity > 0: print('The choosen API by the user is ', API) self.cuda_flag, self.ocl_flag = helper.diagnose( verbosity=self.verbosity) if None is API: if self.cuda_flag is 1: API = 'cuda' elif self.ocl_flag is 1: API = 'ocl' else: warnings.warn( 'No parallelization will be made since no GPU ' 'device has been detected.', UserWarning) else: api = API if self.verbosity > 0: print('The used API will be ', API) if platform_number is None: platform_number = 0 if device_number is None: device_number = 0 from reikna import cluda import reikna.transformations from reikna.cluda import functions, dtypes try: # try to create api/platform/device using the given parameters if 'cuda' == API: api = cluda.cuda_api() elif 'ocl' == API: api = cluda.ocl_api() platform = api.get_platforms()[platform_number] device = platform.get_devices()[device_number] except: # if failed, find out what's going wrong? warnings.warn( 'No parallelization will be made since no GPU ' 'device has been detected.', UserWarning) # return 1 # Create context from device self.thr = api.Thread(device) # pyopencl.create_some_context() self.device = device # : device name if self.verbosity > 0: print('Using opencl or cuda = ', self.thr.api) # """ # Wavefront: as warp in cuda. Can control the width in a workgroup # Wavefront is required in spmv_vector as it improves data coalescence. # see cCSR_spmv and zSparseMatVec # """ self.wavefront = api.DeviceParameters(device).warp_size if self.verbosity > 0: print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront) from ..src import re_subroutine # import create_kernel_sets kernel_sets = re_subroutine.create_kernel_sets(API) prg = self.thr.compile(kernel_sets, render_kwds=dict(LL=str(self.wavefront)), fast_math=False) self.prg = prg
def __init__(self, inputSize, axes=(-1, ), mode="pyfftw", dtype="complex64", direction="FORWARD", fftw_FLAGS=("FFTW_MEASURE", "FFTW_DESTROY_INPUT"), THREADS=None, loggingLevel=None): self.axes = axes self.direction = direction if loggingLevel: logger.setLoggingLevel(loggingLevel) if mode == "gpu" or mode == "gpu_ocl" or mode == "gpu_cuda": if mode == "gpu": mode = "gpu_ocl" if REIKNA_AVAILABLE: if mode == "gpu_ocl": try: reikna_api = cluda.ocl_api() self.reikna_thread = reikna_api.Thread.create() self.FFTMODE = "gpu" except: logger.warning("no reikna opencl available. \ will try cuda") mode = "gpu_cuda" if mode == "gpu_cuda": try: reikna_api = cluda.cuda_api() self.reikna_thread = reikna_api.Thread.create() self.FFTMODE = "gpu" except: logger.warning("no cuda available. \ Switching to pyfftw") mode = "pyfftw" else: logger.warning("No gpu algorithms available\ switching to pyfftw") mode = "pyfftw" if mode == "pyfftw": if PYFFTW_AVAILABLE: self.FFTMODE = "pyfftw" else: logger.warning("No pyfftw available. \ Defaulting to scipy.fftpack") mode = "scipy" if mode == "scipy": if SCIPY_AVAILABLE: self.FFTMODE = "scipy" else: logger.warning("No scipy available - fft won't function.") if self.FFTMODE == "gpu": if direction == "FORWARD": self.inverse = 1 elif direction == "BACKWARD": self.inverse = 0 self.inputData = numpy.zeros(inputSize, dtype=dtype) inputData_dev = self.reikna_thread.to_device(self.inputData) self.outputData_dev = self.reikna_thread.array(inputSize, dtype=dtype) logger.info("Generating and compiling reikna gpu fft plan...") reikna_ft = reikna.fft.FFT(inputData_dev, axes=axes) self.reikna_ft_c = reikna_ft.compile(self.reikna_thread) logger.info("Done!") if self.FFTMODE == "pyfftw": if THREADS == None: THREADS = cpu_count() #fftw_FLAGS Set the optimisation level of fftw3, #(more optimisation takes longer - but gives quicker ffts.) #Can be FFTW_ESTIMATE, FFTW_MEASURE, FFT_PATIENT, FFTW_EXHAUSTIVE n = pyfftw.simd_alignment self.inputData = pyfftw.n_byte_align_empty(inputSize, n, dtype) self.inputData[:] = numpy.zeros(inputSize, dtype=dtype) self.outputData = pyfftw.n_byte_align_empty(inputSize, n, dtype) self.outputData[:] = numpy.zeros(inputSize, dtype=dtype) logger.info( "Generating fftw3 plan....\nIf this takes too long, change fftw_FLAGS." ) logger.debug("currently set to: {})".format(fftw_FLAGS)) if direction == "FORWARD": self.fftwPlan = pyfftw.FFTW(self.inputData, self.outputData, axes=axes, threads=THREADS, flags=fftw_FLAGS) elif direction == "BACKWARD": self.fftwPlan = pyfftw.FFTW(self.inputData, self.outputData, direction='FFTW_BACKWARD', axes=axes, threads=THREADS, flags=fftw_FLAGS) logger.info("Done!") elif self.FFTMODE == "scipy": self.direction = direction self.inputData = numpy.zeros(inputSize, dtype=dtype) self.size = [] for i in range(len(self.axes)): self.size.append(inputSize[self.axes[i]])
def kspacegaussian_filter_CL(ksp, sigma): sz = ksp.shape dtype = np.complex64 ftype = np.float32 #api = cluda.ocl_api() api = cuda_api() thr = api.Thread.create() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) FACTOR = 1.0 program = thr.compile(""" KERNEL void gauss_kernel( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *src) { const ulong x = get_global_id(0);const ulong y = get_global_id(1);const ulong z = get_global_id(2); const SIZE_T dim1= %d; const SIZE_T dim2= %d; const SIZE_T dim3= %d; ${ftype} sigma[3]; sigma[0]=%f;sigma[1]=%f;sigma[2]=%f; ${ftype} factor = %f; const double TWOPISQ = 19.739208802178716; //6.283185307179586; //2*3.141592; const ${ftype} SQRT2PI = 2.5066282746; const double CUBEDSQRT2PI = 15.749609945722419; const ulong idx = dim1*dim2*z + dim1*y + x; ${ftype} i = (${ftype})(x); // )((x / dim3) / dim2); i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1); ${ftype} j = (${ftype})(y); //(x / dim3); if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);}; j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2); //Account for large global index (stored as ulong) before performing modulus //double pre_k=fmod((double)(x) , (double) dim3); ${ftype} k = (${ftype}) (z); //pre_k; k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3); ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1; //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2])); dest[idx].x = src[idx].x * weight; dest[idx].y = src[idx].y * weight; } """ % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR), render_kwds=dict(ctype=dtypes.ctype(dtype), ftype=dtypes.ctype(ftype), exp=functions.exp(ftype)), fast_math=True) gauss_kernel = program.gauss_kernel #data_dev = thr.empty_like(ksp_dev) gauss_kernel(data_dev, data_dev, global_size=(sz[0], sz[1], sz[2])) # ksp_out = data_dev.get() thr.synchronize() ## #api = cuda_api() #thr = api.Thread.create() #data_dev = thr.to_device(ksp_out) ifft = FFT(data_dev) cifft = ifft.compile(thr) cifft(data_dev, data_dev, inverse=0) result = np.fft.fftshift(data_dev.get() / sz[0] * sz[1] * sz[2]) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) return result # ,ksp_out
tic() imggauss2 = kspacegaussian_filter_CL2(ksp, np.ones(3)) print 'Reikna Cuda Gaussian+recon+ Reikna FFTShift: first run' toc() pycuda.tools.clear_context_caches() tic() kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1) image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args) toc() # create two timers so we can speed-test each approach #start = drv.Event() #end = drv.Event() api = cuda_api() thr = api.Thread.create() N = 512 tic() data_dev = thr.to_device(ksp) ifft = FFT(data_dev) cifft = ifft.compile(thr) thr.synchronize() cifft(data_dev, data_dev, inverse=0) result = np.fft.fftshift(data_dev.get() / N**3) result = result[::-1, ::-1, ::-1] result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0) print "Reikna IFFT time and first three results:" print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))
def initialize_gpu(backend, **kwargs): ''' Initialize a new GPU context. :param backend: backend to use. It must be any of "cuda" or "opencl". :type backend: str :param kwargs: it may contain any of the following values: \ - interactive: (bool) whether to select the device manually (defaults to False) \ - device: (int) number of the device to use (defaults to None). :type kwargs: dict .. note:: The device can be selected using the MINKIT_DEVICE environment variable. ''' global BACKEND global DEVICE global CONTEXT global THREAD from reikna import cluda # Establish the backend if BACKEND is not None and backend != BACKEND: raise RuntimeError( f'Attempt to change backend from "{BACKEND}" to "{backend}"; not supported' ) elif backend == CUDA: API = cluda.cuda_api() elif backend == OPENCL: API = cluda.ocl_api() elif backend == BACKEND: # Using same backend return else: raise ValueError(f'Unknown backend type "{backend}"') BACKEND = backend # Get all available devices platforms = API.get_platforms() all_devices = [(p, d) for p in platforms for d in p.get_devices()] # Determine the device to use idev = device_lookup(all_devices, **kwargs) platform, device = all_devices[idev] logger.info( f'Selected device "{device.name}" ({idev}) (platform: {platform.name})' ) DEVICE = device # Create the context and thread if BACKEND == CUDA: CONTEXT = DEVICE.make_context() def clear_cuda_context(): from pycuda.tools import clear_context_caches CONTEXT.pop() clear_context_caches() atexit.register(clear_cuda_context) else: # OPENCL import pyopencl CONTEXT = pyopencl.Context([DEVICE]) THREAD = API.Thread(CONTEXT)