Python FFT.compile 예제들, reikna.fft.FFT.compile Python 예제들

예제 #1

0

파일 보기

파일: fessler_nufft.py 프로젝트: jyhmiinlin/cineFSE

    def __initialize_gpu(self):
        """Try to prepare a reikna OpenCL FFT plan and device buffers.

        Reads ``self.st['Kd']`` (shape of the FFT grid) and ``self.st['w']``.
        On success sets ``self.thr``, ``self.data_dev``, ``self.myfft``,
        ``self.W``, ``self.W_dev`` and ``self.gpu_flag = 1``.  On any failure
        the error is printed and ``self.gpu_flag`` is set to 0 so the caller
        can fall back to the CPU path.
        """
        try:
            import reikna.cluda as cluda
            from reikna.fft import FFT

            grid_shape = self.st['Kd']
            data = numpy.zeros(grid_shape, dtype=numpy.complex64)
            api = cluda.ocl_api()
            # reikna threads are asynchronous by default, so the explicit
            # ``async=True`` keyword is dropped: ``async`` is a reserved word
            # since Python 3.7 (a SyntaxError for the whole module) and newer
            # reikna releases spell the keyword ``async_``.
            self.thr = api.Thread.create()
            self.data_dev = self.thr.to_device(data)
            # Transform over every axis of the grid.
            axes = tuple(range(numpy.size(grid_shape)))
            myfft = FFT(data, axes=axes)
            self.myfft = myfft.compile(self.thr, fast_math=True)

            self.gpu_flag = 1
            print('create gpu fft?', self.gpu_flag)
            print('line 642')
            W = self.st['w'][..., 0]
            print('line 645')
            self.W = numpy.reshape(W, grid_shape, order='C')
            print('line 647')
            print('line 649')
            # NOTE(review): ``dtype`` is not defined in this method; it is
            # presumably a module-level name -- confirm, otherwise the
            # NameError lands in the handler below and disables the GPU.
            self.W_dev = self.thr.to_device(self.W.astype(dtype))
            self.gpu_flag = 1
            print('line 652')
        except Exception as err:
            # Best-effort setup: any failure means "use the CPU".  Unlike a
            # bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
            self.gpu_flag = 0
            print('get error, using cpu')
            print('gpu initialization error: %r' % (err,))

예제 #2

0

파일 보기

    def __initialize_gpu(self):
        """Try to prepare a reikna OpenCL FFT plan and device buffers.

        Reads ``self.st['Kd']`` (shape of the FFT grid) and ``self.st['w']``.
        On success sets ``self.thr``, ``self.data_dev``, ``self.myfft``,
        ``self.W``, ``self.W_dev`` and ``self.gpu_flag = 1``.  On any failure
        the error is printed and ``self.gpu_flag`` is set to 0 so the caller
        can fall back to the CPU path.
        """
        try:
            import reikna.cluda as cluda
            from reikna.fft import FFT

            grid_shape = self.st['Kd']
            data = numpy.zeros(grid_shape, dtype=numpy.complex64)
            api = cluda.ocl_api()
            # reikna threads are asynchronous by default, so the explicit
            # ``async=True`` keyword is dropped: ``async`` is a reserved word
            # since Python 3.7 (a SyntaxError for the whole module) and newer
            # reikna releases spell the keyword ``async_``.
            self.thr = api.Thread.create()
            self.data_dev = self.thr.to_device(data)
            # Transform over every axis of the grid.
            axes = tuple(range(numpy.size(grid_shape)))
            myfft = FFT(data, axes=axes)
            self.myfft = myfft.compile(self.thr, fast_math=True)

            self.gpu_flag = 1
            print('create gpu fft?', self.gpu_flag)
            print('line 642')
            W = self.st['w'][..., 0]
            print('line 645')
            self.W = numpy.reshape(W, grid_shape, order='C')
            print('line 647')
            print('line 649')
            # NOTE(review): ``dtype`` is not defined in this method; it is
            # presumably a module-level name -- confirm, otherwise the
            # NameError lands in the handler below and disables the GPU.
            self.W_dev = self.thr.to_device(self.W.astype(dtype))
            self.gpu_flag = 1
            print('line 652')
        except Exception as err:
            # Best-effort setup: any failure means "use the CPU".  Unlike a
            # bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
            self.gpu_flag = 0
            print('get error, using cpu')
            print('gpu initialization error: %r' % (err,))

예제 #3

0

파일 보기

파일: test_fft.py 프로젝트: ringw/reikna

def check_performance(thr_and_double, shape_and_axes, fast_math):
    """Time a compiled FFT and verify it against ``numpy.fft.fftn``.

    :param thr_and_double: pair ``(thr, double)``; ``double`` selects
        complex128 (truthy) or complex64 (falsy) test data.
    :param shape_and_axes: pair ``(shape, axes)`` describing the problem.
    :param fast_math: forwarded to ``FFT.compile``.
    :returns: ``(mean_time, work)`` where ``mean_time`` is the average
        wall-clock time per call over 10 calls and ``work`` is
        ``product(shape) * sum(log2(shape[a]) for a in axes) * 5``
        (presumably an operation-count estimate).
    """
    thr, double = thr_and_double
    shape, axes = shape_and_axes

    if double:
        dtype = numpy.complex128
    else:
        dtype = numpy.complex64

    host_data = get_test_array(shape, dtype)
    src_dev = thr.to_device(host_data)
    dst_dev = thr.empty_like(src_dev)

    compiled = FFT(src_dev, axes=axes).compile(thr, fast_math=fast_math)

    # Queue all runs, then synchronize once so the clock covers them all.
    n_runs = 10
    started = time.time()
    for _ in range(n_runs):
        compiled(dst_dev, src_dev)
    thr.synchronize()
    finished = time.time()
    mean_time = (finished - started) / n_runs

    reference = numpy.fft.fftn(host_data, axes=axes).astype(dtype)
    assert diff_is_negligible(dst_dev.get(), reference)

    log_lengths = sum(numpy.log2(shape[a]) for a in axes)
    return mean_time, product(shape) * log_lengths * 5

예제 #4

0

파일 보기

파일: imageprocessing.py 프로젝트: ufo-kit/syris

def _fft_2(data, inverse=False, queue=None, block=True):
    """Run an in-place 2D FFT (axes 0 and 1) on *data* and return the result.

    *data* is first converted with ``g_util.get_array`` and, if needed,
    retyped to ``cfg.PRECISION.np_cplx``.  Compiled plans are cached in
    ``cfg.OPENCL.fft_plans`` per queue and per array shape.  When *queue* is
    falsy ``cfg.OPENCL.queue`` is used.  If *block* is true the call waits
    for the transform to finish before returning.
    """
    queue = queue or cfg.OPENCL.queue
    thread = ocl_api().Thread(queue)

    data = g_util.get_array(data, queue=queue)
    if data.dtype != cfg.PRECISION.np_cplx:
        data = data.astype(cfg.PRECISION.np_cplx)

    # Two-level cache: queue -> shape -> compiled plan.
    all_plans = cfg.OPENCL.fft_plans
    if queue not in all_plans:
        all_plans[queue] = {}
    queue_plans = all_plans[queue]
    if data.shape not in queue_plans:
        LOG.debug("Creating FFT Plan for {} and shape {}".format(queue, data.shape))
        queue_plans[data.shape] = FFT(data, axes=(0, 1)).compile(thread, fast_math=False)
    plan = queue_plans[data.shape]

    LOG.debug("fft_2, shape: %s, inverse: %s", data.shape, inverse)
    plan(data, data, inverse=inverse)
    if block:
        thread.synchronize()

    return data

예제 #5

0

파일 보기

파일: gpuCore.py 프로젝트: CSymes/quickDDM

def createComplexFFTKernel(thread, shape):
    """Compile a complex128 FFT for *shape* whose output is divided by a constant.

    The divisor is ``sqrt(shape[-2] * shape[-1])``; it is attached to the
    FFT output through a ``div_const`` transformation before compiling on
    *thread*.  Returns the compiled computation.
    """
    norm = numpy.sqrt(shape[-2] * shape[-1])
    template = thread.array(shape, dtype=numpy.complex128)

    transform = FFT(template)
    divider = div_const(template, norm)
    transform.parameter.output.connect(
        divider, divider.input, output_prime=divider.output)

    return transform.compile(thread)

예제 #6

0

파일 보기

파일: gpufft.py 프로젝트: 1nadequacy/kaggle_ds2

    def get_fftc(self, arr):
        """Return a compiled complex64 FFT for ``arr.shape``, compiling on first use.

        Calls ``self._initialize()`` first; compiled plans are cached in
        ``self._fftc`` keyed by shape.
        """
        self._initialize()

        key = arr.shape
        if key not in self._fftc:
            template = self._thr.array(key, np.complex64)
            self._fftc[key] = FFT(template).compile(self._thr)
        return self._fftc[key]

예제 #7

0

파일 보기

파일: Ptychography_2dsample_2dprobe_farfield_gpu.py 프로젝트: andyofmelbourne/Ptychography

 def __init__(self, diffs, coords, mask, probe, sample, sample_support, pmod_int = False): 
     """Store the ptychography inputs and set up the GPU buffers and FFT.

     The arrays are passed in directly (nothing is read from disk here).
     Expected data, per the original author's notes:

     coords          list of y, x coordinates in np.float64 pixel units
     diffs           stack of diffraction patterns (e.g. 322 x (256, 512))
                     in np.float64.  The zero pixel must be at [0, 0] and
                     there must be an equal no. of positive and negative
                     frequencies
     mask            mask for the diffraction data, np.float64
     probe           estimate for the probe, np.complex128
     sample          estimate for the sample, np.complex128; also sets the
                     field of view
     sample_support  stored unchanged on the instance
     pmod_int        stored unchanged on the instance (default False)
     """
     #
     # Get the shape of a single diffraction pattern
     shape  = diffs[0].shape
     #
     # Store these values
     self.exits      = makeExits(sample, probe, coords)
     #
     # This will save time later
     self.diffAmps   = bg.quadshift(np.sqrt(diffs))
     self.shape      = shape
     self.shape_sample = sample.shape
     self.coords     = coords
     self.mask       = bg.quadshift(mask)
     self.probe      = probe
     self.sample     = sample
     self.alpha_div  = 1.0e-10
     self.error_mod  = []
     self.error_sup  = []
     self.error_conv = []
     self.probe_sum  = None
     self.sample_sum = None
     # Masked sum of the squared amplitudes (uses the shifted mask/amps above)
     self.diffNorm   = np.sum(self.mask * (self.diffAmps)**2)
     self.pmod_int   = pmod_int
     self.sample_support = sample_support
     #
     # create a gpu thread (CUDA backend)
     api               = cluda.cuda_api()
     self.thr          = api.Thread.create()
     #
     # send the diffraction amplitudes, the exit waves and the mask to the gpu
     # The amplitudes are scaled by sqrt(shape[1] * shape[2]) on upload
     # (presumably to match an unnormalised FFT -- confirm).
     self.diffAmps_gpu = self.thr.to_device(self.diffAmps) * np.sqrt(float(self.diffAmps.shape[1]) * float(self.diffAmps.shape[2]))
     self.exits_gpu    = self.thr.to_device(self.exits)
     # Broadcast the shifted mask over every pattern as complex128
     mask2             = np.zeros_like(diffs, dtype=np.complex128)
     mask2[:]          = self.mask.astype(np.complex128)
     self.mask_gpu     = self.thr.to_device(mask2)
     #
     # compile the fft routine (transforms axes 1 and 2 of the stack)
     fft               = FFT(self.diffAmps_gpu.astype(np.complex128), axes=(1,2))
     self.fftc         = fft.compile(self.thr, fast_math=True)

예제 #8

0

파일 보기

파일: tcap_loop_gpu.py 프로젝트: xaratustrah/iq_apps

def process(hdr_fiename, filename):
    """Compute and plot a spectrogram of one TCAP IQ file using a GPU FFT.

    The file is read in 780 chunks of 20 blocks.  Each chunk is reshaped to
    ``(10, 65536)``, transformed along axis 1 on the GPU, averaged over the
    10 rows, and its shifted magnitude becomes one spectrogram row.  The
    result is handed to ``plot_spectrogram``; nothing is returned.

    :param hdr_fiename: header file name passed to ``TCAPData``.
    :param filename: data file name passed to ``TCAPData``; the last three
        characters of its extension-less name are parsed as the file counter.
    """
    api = cluda.cuda_api()
    thr = api.Thread.create()
    X = thr.array((10, 32768 * 2), dtype=numpy.complex128)

    iq_data = TCAPData(filename, hdr_fiename)
    file_counter = int(iq_data.filename_wo_ext[-3:])
    fs = 312500
    file_length_in_sec = 15625 * 32768 / fs
    time_passed_upto_now = (file_counter - 1) * file_length_in_sec

    # extract hour min sec
    hr, placeholder = divmod(time_passed_upto_now, 3600)
    mnt, sec = divmod(placeholder, 60)
    total_time = '{}h-{}m-{}s'.format(int(hr), int(mnt), int(sec))
    title = 'Time: {}:{}:{}'.format(int(hr), int(mnt), int(sec))

    # The FFT plan depends only on the chunk's shape/dtype, which is the
    # same for every iteration, so it is compiled once on first use instead
    # of being recompiled for each of the 780 chunks.
    fftc = None
    # Rows are collected in a list and joined once; growing one array with
    # np.append copied the whole spectrogram on every iteration.
    spectra = []
    for j in range(1, 780 * 2 * 10 + 1, 2 * 10):
        data = np.array([])
        # read 2*10 i.e. 20 blocks
        for i in range(j, j + 2 * 10):
            data = np.append(data, iq_data.read_block(i))
        data = np.reshape(data, (10, 32768 * 2))

        x = thr.to_device(data)
        if fftc is None:
            fftc = FFT(x, axes=(1, )).compile(thr)
        fftc(X, x, 0)
        # NOTE(review): X is a device array; confirm numpy can consume it
        # directly here or whether X.get() is required.
        data_fft = X

        data_fft = np.average(data_fft, axis=0)
        data_fft = np.abs(np.fft.fftshift(data_fft))
        spectra.append(data_fft)

    zz = np.reshape(np.concatenate(spectra), (780, 32768 * 2))
    data_fft_freqs = np.fft.fftshift(np.fft.fftfreq(32768 * 2,
                                                    d=1 / fs))  # in Hz
    xx, yy = np.meshgrid(data_fft_freqs, np.arange(780))
    yy = yy * 2.10  # in seconds
    plt_filename = '{}_{}'.format(iq_data.filename_wo_ext, total_time)
    print('Printing into file: ' + plt_filename)
    plot_spectrogram(xx,
                     yy,
                     zz,
                     dbm=False,
                     cmap=cm.jet,
                     filename=plt_filename,
                     dpi=500,
                     title=title)

예제 #9

0

파일 보기

    def initialize_gpu(self):
        """Try to prepare a reikna OpenCL FFT plan and device buffers.

        Reads ``self.st['Kd']`` (shape of the FFT grid) and ``self.st['w']``.
        On success sets ``self.gpu_api``, ``self.thr``, ``self.data_dev``,
        ``self.myfft``, ``self.W``, ``self.W_dev``, ``self.W2_dev``,
        ``self.tmp_dev`` and ``self.gpu_flag = 1``.  On any failure the
        error is printed and ``self.gpu_flag`` is set to 0 so the caller can
        fall back to the CPU path.
        """
        try:
            import reikna.cluda as cluda
            from reikna.fft import FFT

            grid_shape = self.st['Kd']
            # NOTE(review): ``dtype`` is not defined in this method; it is
            # presumably a module-level name -- confirm.
            data = numpy.zeros(grid_shape, dtype=dtype)

            print('get_platform')
            api = cluda.ocl_api()

            # Compare API identifiers instead of calling cluda.cuda_api():
            # that call imports the CUDA backend and raised on OpenCL-only
            # machines, which aborted the whole GPU setup for a debug print.
            print('api=', api.get_id() == cluda.cuda_id())

            self.gpu_api = 'opencl'

            # reikna threads are asynchronous by default, so the explicit
            # ``async=True`` keyword is dropped: ``async`` is a reserved word
            # since Python 3.7 (a SyntaxError for the whole module) and newer
            # reikna releases spell the keyword ``async_``.
            self.thr = api.Thread.create()
            print('line 630')
            self.data_dev = self.thr.to_device(data)

            # Transform over every axis of the grid.
            axes = tuple(range(numpy.size(grid_shape)))
            print('line 635')
            myfft = FFT(data, axes=axes)
            print('line 640')
            self.myfft = myfft.compile(self.thr, fast_math=True)
            print('line 640')
            self.gpu_flag = 1

            print('create gpu fft?', self.gpu_flag)
            print('line 642')

            W = self.st['w'][..., 0]

            print('line 645')

            self.W = numpy.reshape(W, grid_shape, order='C')

            print('line 647')
            print('line 649')
            # Three independent device copies of the same host array.
            self.W_dev = self.thr.to_device(self.W.astype(dtype))
            self.W2_dev = self.thr.to_device(self.W.astype(dtype))
            self.tmp_dev = self.thr.to_device(self.W.astype(dtype))  # device memory
            self.gpu_flag = 1
            print('line 652')
        except Exception as err:
            # Best-effort setup: any failure means "use the CPU".  Unlike a
            # bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
            self.gpu_flag = 0
            print('get error, using cpu')
            print('gpu initialization error: %r' % (err,))
예제 #10

0

파일 보기

def reikna_fft(a, inverse=False):
    '''
    Compute the FFT (or the inverse FFT when *inverse* is true) of *a*.

    The compiled computation is kept in ``FFT_CACHE`` keyed by ``a.shape``,
    so it is compiled only the first time a given shape is seen.  The
    output buffer comes from the ``types.cpu_complex`` array cache, sized
    with ``len(a)``, and is returned.
    '''
    # Look up, or compile and remember, the computation for this shape.
    # NOTE(review): the cache key ignores a.dtype -- confirm callers never
    # mix dtypes for one shape.
    compiled = FFT_CACHE.get(a.shape)
    if compiled is None:
        compiled = FFT(a).compile(THREAD)
        FFT_CACHE[a.shape] = compiled

    result = get_array_cache(types.cpu_complex).get_array(len(a))
    compiled(result, a, inverse=inverse)
    return result

예제 #11

0

파일 보기

파일: fftmanager.py 프로젝트: iandobbie/Chromagnon

    def rfft(self, a, nthreads=ncpu):
        """Forward FFT of the real array *a* using the first usable backend.

        Backends are tried in this order: the ``SCIK`` GPU path, the
        ``REIK`` (reikna) GPU path, ``FFTW`` (pyfftw, with *nthreads*), and
        finally ``numpy.fft.rfftn``.  The two GPU paths are used only when
        ``self.is_gpu_memory_enough(a)`` is true.

        NOTE(review): only the reikna path transforms a full complex array
        of ``a.shape`` and applies ``fftshift`` to its result -- confirm this
        difference from the other branches is intended.
        """
        a = self.check_array(a, RTYPES, RTYPE)

        if SCIK and self.is_gpu_memory_enough(a):
            # Last axis shrinks to n//2 + 1 for a real-to-complex transform.
            out_shape = list(a.shape)
            out_shape[-1] = out_shape[-1]//2 + 1
            out_dtype = G_RTYPES[a.dtype.type]
            transform = fft.fft
            return self._fft_scik(a, transform, out_shape, out_dtype)

        if REIK and self.is_gpu_memory_enough(a):
            thr = self.api.Thread(self.dev)

            plan = FFT(Type(complex_for(a.dtype), a.shape))

            # Feed the complex FFT input from a real part (the user array)
            # and an imaginary part that is a broadcast constant 0.
            combiner = combine_complex(plan.parameter.input)
            zero_imag = broadcast_const(combiner.imag, 0)

            plan.parameter.input.connect(
                combiner, combiner.output,
                real_input=combiner.real, imag_input=combiner.imag)
            plan.parameter.imag_input.connect(zero_imag, zero_imag.output)

            compiled = plan.compile(thr, fast_math=True)

            src_dev = thr.to_device(a)
            dst_dev = thr.empty_like(plan.parameter.output)

            compiled(dst_dev, src_dev)

            return N.fft.fftshift(dst_dev.get())

        if FFTW:
            return self._fftw(a, pyfftw.builders.rfftn, nthreads)

        return N.fft.rfftn(a)

예제 #12

0

파일 보기

파일: fftmanager.py 프로젝트: macronucleus/Chromagnon

    def rfft(self, a, nthreads=ncpu):
        """Forward FFT of the real array *a* using the first usable backend.

        Backends are tried in this order: the ``SCIK`` GPU path, the
        ``REIK`` (reikna) GPU path, ``FFTW`` (pyfftw, with *nthreads*), and
        finally ``numpy.fft.rfftn``.  The two GPU paths are used only when
        ``self.is_gpu_memory_enough(a)`` is true.

        NOTE(review): only the reikna path transforms a full complex array
        of ``a.shape`` and applies ``fftshift`` to its result -- confirm this
        difference from the other branches is intended.
        """
        a = self.check_array(a, RTYPES, RTYPE)

        if SCIK and self.is_gpu_memory_enough(a):
            # Last axis shrinks to n//2 + 1 for a real-to-complex transform.
            out_shape = list(a.shape)
            out_shape[-1] = out_shape[-1]//2 + 1
            out_dtype = G_RTYPES[a.dtype.type]
            transform = fft.fft
            return self._fft_scik(a, transform, out_shape, out_dtype)

        if REIK and self.is_gpu_memory_enough(a):
            thr = self.api.Thread(self.dev)

            plan = FFT(Type(complex_for(a.dtype), a.shape))

            # Feed the complex FFT input from a real part (the user array)
            # and an imaginary part that is a broadcast constant 0.
            combiner = combine_complex(plan.parameter.input)
            zero_imag = broadcast_const(combiner.imag, 0)

            plan.parameter.input.connect(
                combiner, combiner.output,
                real_input=combiner.real, imag_input=combiner.imag)
            plan.parameter.imag_input.connect(zero_imag, zero_imag.output)

            compiled = plan.compile(thr, fast_math=True)

            src_dev = thr.to_device(a)
            dst_dev = thr.empty_like(plan.parameter.output)

            compiled(dst_dev, src_dev)

            return N.fft.fftshift(dst_dev.get())

        if FFTW:
            return self._fftw(a, pyfftw.builders.rfftn, nthreads)

        return N.fft.rfftn(a)

예제 #13

0

파일 보기

파일: test_fft.py 프로젝트: ringw/reikna

def test_trivial(some_thr):
    """
    A trivial FFT (every transformed axis has length 1) must still run the
    transformations attached to it: the input is scaled by a parameter and
    the output is expected to equal ``data * param``.
    """
    shape = (128, 1, 1, 128)
    dtype = numpy.complex64
    multiplier = 4

    host = get_test_array(shape, dtype)
    src_dev = some_thr.to_device(host)
    dst_dev = some_thr.empty_like(src_dev)

    # Axes 1 and 2 both have length 1, so the FFT itself is a no-op.
    fft = FFT(src_dev, axes=(1, 2))
    scaler = mul_param(src_dev, numpy.int32)
    fft.parameter.input.connect(
        scaler, scaler.output, input_prime=scaler.input, param=scaler.param)

    compiled = fft.compile(some_thr)
    compiled(dst_dev, src_dev, multiplier)
    assert diff_is_negligible(dst_dev.get(), host * multiplier)

예제 #14

0

파일 보기

파일: fft.py 프로젝트: hbcbh1999/fluidimage

    def __init__(self, nx, ny):
        """Create a GPU thread and compile a 2D FFT plan for an (ny, nx) grid.

        Sets ``self.shapeX``, ``self.arrayK`` (uninitialised array of
        ``self.type_complex``), ``self.thr``, ``self.fftplan`` (FFT over
        axes 0 and 1 whose input is multiplied by a scalar parameter) and
        ``self.coef_norm = nx * ny``.
        """
        shapeX = [ny, nx]
        shapeK = [ny, nx]

        self.shapeX = shapeX
        self.arrayK = np.empty(shapeK, dtype=self.type_complex)

        # Pick the first available GPGPU API and make a Thread on it.
        api = any_api()
        # NOTE(review): the device list is passed as the first positional
        # argument of Thread.create -- confirm that is the intended
        # parameter.
        dev = api.get_platforms()[0].get_devices()
        self.thr = api.Thread.create(dev)
        fft = FFT(self.arrayK, axes=(0, 1))
        # np.float was only an alias of the builtin float (i.e. float64)
        # and was removed in NumPy 1.24; name the dtype explicitly.
        scale = mul_param(self.arrayK, np.float64)
        fft.parameter.input.connect(scale,
                                    scale.output,
                                    input_prime=scale.input,
                                    param=scale.param)
        self.fftplan = fft.compile(self.thr, fast_math=True)

        self.coef_norm = nx * ny

예제 #15

0

파일 보기

파일: test_fft.py 프로젝트: ringw/reikna

def check_errors(thr, shape_and_axes):
    """Check in-place forward and inverse complex64 FFTs against numpy.

    :param thr: thread used to compile and run the FFT.
    :param shape_and_axes: pair ``(shape, axes)`` describing the problem.
    """
    shape, axes = shape_and_axes
    dtype = numpy.complex64

    host = get_test_array(shape, dtype)
    compiled = FFT(host, axes=axes).compile(thr)

    def run_in_place(**call_kwds):
        # In-place is the harder case: if it works, out-of-place
        # presumably does too.
        buf = thr.to_device(host)
        compiled(buf, buf, **call_kwds)
        return buf.get()

    # forward transform
    forward = run_in_place()
    forward_ref = numpy.fft.fftn(host, axes=axes).astype(dtype)
    assert diff_is_negligible(forward, forward_ref)

    # inverse transform
    backward = run_in_place(inverse=True)
    backward_ref = numpy.fft.ifftn(host, axes=axes).astype(dtype)
    assert diff_is_negligible(backward, backward_ref)

예제 #16

0

파일 보기

파일: testany.py 프로젝트: xialulee/WaveSyn

             Parameter('output', Annotation(Type(dtype, size), 'o')),
             Parameter('input', Annotation(Type(dtype, size), 'i'))
        ],
        '''
        ${input.ctype} val = ${input.load_same};       
        ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        ''',
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float32 if single else np.double))
    )
    return unimod

# --- single precision: FFT with the transformation on its output ---
# unimod_gen returns a transformation whose visible tail stores
# polar_unit(atan2(val.y, val.x)) for each input value.
unimod = unimod_gen(size)

# Attach the transformation to the FFT output; the new output is named 'uni'.
ffts = FFT(thr.array(size, dtype=np.complex64))
ffts.parameter.output.connect(unimod, unimod.input, uni=unimod.output)
ffts_unimod = ffts.compile(thr)

# Host ramp 0..size-1, uploaded to the device (x is rebound to the device array).
x = np.arange(size, dtype=np.complex64)
x = thr.to_device(x)
X = thr.array((size,), dtype=np.complex64)
ffts_unimod(X, x)
print(X)


# --- double precision: same pipeline with complex128 buffers ---
unimod = unimod_gen(size, single=False)
fftd = FFT(thr.array(size, dtype=np.complex128))
fftd.parameter.output.connect(unimod, unimod.input, uni=unimod.output)
fftd_unimod = fftd.compile(thr)
x = np.arange(size, dtype=np.complex128)
x = thr.to_device(x)
X = thr.array((size,), dtype=np.complex128)

예제 #17

0

파일 보기

파일: factories.py 프로젝트: kaizhongkaizhong/WaveSyn

 def create(thr, size, dtype=np.complex128, axes=None, compile_=True):
     """Build an FFT for an array of *size* and *dtype* allocated on *thr*.

     Returns the computation compiled on *thr* when *compile_* is true,
     otherwise the uncompiled ``FFT`` object.  *axes* is forwarded to
     ``FFT`` unchanged.
     """
     template = thr.array(size, dtype=dtype)
     computation = FFT(template, axes)
     return computation.compile(thr) if compile_ else computation

예제 #18

0

파일 보기

파일: test_fft.py 프로젝트: piveloper/pyopencl-extension

def test_fft(in_data_np):
    """Compare the OpenCL ``Fft`` with ``numpy.fft.fft`` for *in_data_np*.

    The numpy reference is computed on a copy of the input that is
    zero-padded along axis 1 to the next power of two.  For small inputs
    (fewer than 1024 elements) the emulated ``Fft`` is checked as well.  An
    optional reikna benchmark is kept behind ``if False``.

    :param in_data_np: 2D complex host array (``Types.cfloat`` selects the
        looser tolerance).
    """
    atol = 1e-4 if in_data_np.dtype == Types.cfloat else 1e-8
    import numpy as np

    # Number of timing repetitions.  Defined at function scope so that both
    # ``measure`` and the optional reikna benchmark below can see it (the
    # benchmark previously referenced a name local to ``measure``).
    attempts = 3

    in_data_cl = to_device(in_data_np)

    fft_cl = Fft(in_data_cl, emulate=False)

    # zero padding data for numpy
    axis = 1
    N = in_data_np.shape[axis]
    if not np.log2(N).is_integer():  # if not power of 2, pad accordingly
        N = 2**int(np.log2(N) + 1)
    in_data_np_power_of_two = np.zeros((in_data_np.shape[0], N),
                                       in_data_np.dtype)
    in_data_np_power_of_two[:, :in_data_np.shape[axis]] = in_data_np

    def measure(call):
        """Return the fastest wall-clock time of ``attempts`` runs of *call*."""
        ts = []
        for _ in range(attempts):
            t1 = time.time()
            call()
            t2 = time.time()
            ts.append(t2 - t1)
        return min(ts)

    # import pyfftw
    # t_fftw = measure(lambda: pyfftw.interfaces.numpy_fft.fft(in_data_np_power_of_two, axis=-1))
    t_np = measure(lambda: np.fft.fft(in_data_np_power_of_two, axis=-1))
    fft_in_data_np = np.fft.fft(in_data_np_power_of_two, axis=-1)

    def fft_call():
        # Wait for the queue so the timing covers the actual execution.
        fft_in_data_cl = fft_cl()
        fft_in_data_cl.queue.finish()

    t_cl = measure(fft_call)
    fft_in_data_cl = fft_cl()

    if in_data_np.size < 1024:
        # Test against emulation (only for small inputs since it is slower)
        use_existing_file_for_emulation(False)
        fft_cl_py = Fft(in_data_cl, emulate=True)

        fft_in_data_cl_py = fft_cl_py()
        a = fft_in_data_cl_py.get().view(Types.cdouble)
        b = fft_in_data_cl.get().view(Types.cdouble)
        c = fft_in_data_np.view(Types.cdouble)
        assert np.allclose(a, b)
        assert np.allclose(c, b)
        assert np.allclose(c, a)

    # import matplotlib.pyplot as plt
    # plt.plot(fft_in_data_np.flatten())
    # plt.plot(fft_in_data_cl_emulation.get().flatten())
    # plt.show()
    assert np.allclose(fft_in_data_np, fft_in_data_cl.get(), atol=atol)
    # benchmark using reikna
    if False:  # change to true to run against reikna's fft. Note: Reikna takes quite some optimization time before run
        from reikna.cluda import any_api
        from reikna.fft import FFT
        import numpy
        api = any_api()
        thr = api.Thread.create()
        data = in_data_np
        dtype = data.dtype
        axes = (1, )
        fft = FFT(data, axes=axes)
        fftc = fft.compile(thr)
        data_dev = thr.to_device(data)
        res_dev = thr.empty_like(data_dev)
        ts = []
        for i in range(attempts):
            t1 = time.time()
            fftc(res_dev, data_dev)
            thr.synchronize()
            t2 = time.time()
            ts.append(t2 - t1)
        # First numpy call is untimed; the second one is timed.
        fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
        tnp = time.time()
        fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
        tnp = time.time() - tnp
        # numpy.fft.fftn(data[:, :, 0], axes=(1,))
        treikna_min = min(ts)
        assert np.allclose(fft_in_data_np, res_dev.get())

예제 #19

0

파일 보기

        Parameter('input', Annotation(Type(dtype, size), 'i'))
    ],
                            '''
        ${input.ctype} val = ${input.load_same};       
        ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        ''',
                            render_kwds=dict(polar_unit=functions.polar_unit(
                                dtype=np.float32 if single else np.double)))
    return unimod


# --- single precision: FFT with the transformation on its output ---
# unimod_gen returns a transformation whose visible tail stores
# polar_unit(atan2(val.y, val.x)) for each input value.
unimod = unimod_gen(size)

# Attach the transformation to the FFT output; the new output is named 'uni'.
ffts = FFT(thr.array(size, dtype=np.complex64))
ffts.parameter.output.connect(unimod, unimod.input, uni=unimod.output)
ffts_unimod = ffts.compile(thr)

# Host ramp 0..size-1, uploaded to the device (x is rebound to the device array).
x = np.arange(size, dtype=np.complex64)
x = thr.to_device(x)
X = thr.array((size, ), dtype=np.complex64)
ffts_unimod(X, x)
print(X)

# --- double precision: same pipeline with complex128 buffers ---
unimod = unimod_gen(size, single=False)
fftd = FFT(thr.array(size, dtype=np.complex128))
fftd.parameter.output.connect(unimod, unimod.input, uni=unimod.output)
fftd_unimod = fftd.compile(thr)
x = np.arange(size, dtype=np.complex128)
x = thr.to_device(x)
X = thr.array((size, ), dtype=np.complex128)
fftd_unimod(X, x)

예제 #20

0

파일 보기

def run_test(thr, shape, dtype, axes=None):
    """Time FFT + fftshift on the GPU in several configurations and on the CPU.

    Random data of *shape* and *dtype* is transformed over *axes* with:
    a compiled ``FFT`` and a compiled ``FFTShift`` timed separately, the
    two run back to back, and a single FFT with an ``fftshift``
    transformation attached to its output.  numpy provides the reference.
    Both GPU results are asserted to match the numpy result.

    :returns: dict of wall-clock timings (``time.time()`` differences) with
        keys ``t_gpu_fft``, ``t_gpu_shift``, ``t_gpu_separate``,
        ``t_gpu_combined``, ``t_cpu_fft``, ``t_cpu_shift``, ``t_cpu_all``.
    """

    data = numpy.random.normal(size=shape).astype(dtype)

    fft = FFT(data, axes=axes)
    fftc = fft.compile(thr)

    shift = FFTShift(data, axes=axes)
    shiftc = shift.compile(thr)

    # FFT + shift as two separate computations

    data_dev = thr.to_device(data)

    # Each stage is timed on its own, synchronizing before stopping the clock.
    t_start = time.time()
    fftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_fft = time.time() - t_start

    t_start = time.time()
    shiftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_shift = time.time() - t_start

    # Fresh upload: the previous buffer was overwritten in place.
    data_dev = thr.to_device(data)

    t_start = time.time()
    fftc(data_dev, data_dev)
    shiftc(data_dev, data_dev)
    thr.synchronize()
    t_gpu_separate = time.time() - t_start

    data_gpu = data_dev.get()

    # FFT + shift as a computation with a transformation

    data_dev = thr.to_device(data)

    # a separate output array to avoid unsafety of the shift transformation
    res_dev = thr.empty_like(data_dev)

    # The transformation is connected to the same ``fft`` object that was
    # compiled above as ``fftc``; ``fft2`` is whatever ``connect`` returns.
    shift_tr = fftshift(data, axes=axes)
    fft2 = fft.parameter.output.connect(shift_tr,
                                        shift_tr.input,
                                        new_output=shift_tr.output)
    fft2c = fft2.compile(thr)

    t_start = time.time()
    fft2c(res_dev, data_dev)
    thr.synchronize()
    t_gpu_combined = time.time() - t_start

    # Reference calculation with numpy

    t_start = time.time()
    numpy.fft.fftn(data, axes=axes)
    t_cpu_fft = time.time() - t_start

    t_start = time.time()
    numpy.fft.fftshift(data, axes=axes)
    t_cpu_shift = time.time() - t_start

    t_start = time.time()
    data_ref = numpy.fft.fftn(data, axes=axes)
    data_ref = numpy.fft.fftshift(data_ref, axes=axes)
    t_cpu_all = time.time() - t_start

    data_gpu2 = res_dev.get()

    # Checking that the results are correct
    # (note: this will require relaxing the tolerances
    # if complex64 is used instead of complex128)
    assert numpy.allclose(data_ref, data_gpu)
    assert numpy.allclose(data_ref, data_gpu2)

    return dict(t_gpu_fft=t_gpu_fft,
                t_gpu_shift=t_gpu_shift,
                t_gpu_separate=t_gpu_separate,
                t_gpu_combined=t_gpu_combined,
                t_cpu_fft=t_cpu_fft,
                t_cpu_shift=t_cpu_shift,
                t_cpu_all=t_cpu_all)

예제 #21

0

파일 보기

파일: profile_reikna_opencl.py 프로젝트: maeager/Agilent2Dicom

def kspaceepanechnikov_filter_CL2(ksp, sigma):
    """Filter k-space with an Epanechnikov kernel on the GPU and reconstruct.

    Steps, as implemented below:

    1. Build an image-domain Epanechnikov kernel sized from *sigma* and
       place it near the centre of a zero array shaped like *ksp*.
    2. FFT + fftshift that array on the GPU and take its magnitude to get
       the Fourier-domain filter.
    3. Multiply *ksp* by the filter with a small custom kernel.
    4. Run FFT + fftshift on the product, divide by the number of
       elements, reverse all three axes and return the result.

    This is Python 2 code (``print`` statements, ``xrange``).

    :param ksp: 3D complex k-space array.
    :param sigma: array-like of three filter widths (indexed 0..2).
    :returns: the reconstructed array described in step 4.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    clear_first_arg_caches()
    # This default is overwritten a few lines below.
    fsiz = (5, 5, 5)
    print(np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2, np.ceil(sigma[2]) + 2)
    print sigma
    fsiz = (np.ceil(sigma) + 2).astype(int)
    # Bump every even kernel size to the next odd number.
    for i in xrange(0, fsiz.size):
        if not fsiz[i] & 0x1:
            fsiz[i] += 1
    # Create image-domain Epanechikov kernel
    Kepa = epanechnikov_kernel(fsiz, sigma)
    # Place kernel at centre of ksp-sized matrix
    Kfilter = np.zeros(np.array(sz), dtype=np.complex64)
    # NOTE(review): szmin/szmax come from np.floor and are therefore
    # floats; recent NumPy versions reject float slice bounds -- confirm
    # the NumPy version this targets.
    szmin = np.floor(
        np.array(sz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1)
    szmax = np.floor(szmin + np.array(Kepa.shape))
    print "Epa filter size ", sz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax
    # The kernel is written to both the real and the imaginary part.
    Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa
    Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1],
            szmin[2]:szmax[2]].imag = Kepa
    # Create fourier-domain Epanechnikov filter
    api = any_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(Kfilter)
    rfft = FFT(data_dev)
    crfft = rfft.compile(thr)
    fftshift = FFTShift(data_dev)
    cfftshift = fftshift.compile(thr)
    crfft(data_dev, data_dev)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    Fepanechnikov = np.abs(data_dev.get())  # / np.prod(np.array(ksp.shape))
    #result2 = result2[::-1,::-1,::-1]
    thr.synchronize()
    #result = np.zeros(np.array(siz), dtype=np.complex64)
    #result.real = np.abs(result2) / np.sqrt(2)
    #result.imag = np.abs(result2) / np.sqrt(2)
    # Drop the filter-building objects before the next stage.
    del data_dev, rfft, crfft, fftshift, cfftshift
    # Multiply Epanechnikov filter to real and imag ksp data
    # NOTE(review): the kernel indexes with get_local_id(0) rather than a
    # global id -- confirm this reaches every element.
    program = thr.compile("""
KERNEL void multiply_them(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *a,
    GLOBAL_MEM ${ftype} *f)
{
  const SIZE_T i = get_local_id(0);
  dest[i].x = a[i].x * f[i];
  dest[i].y = a[i].y * f[i];
}""",
                          render_kwds=dict(ctype=dtypes.ctype(dtype),
                                           ftype=dtypes.ctype(ftype)))

    data_dev = thr.to_device(ksp)
    filter_dev = thr.to_device(Fepanechnikov)
    multiply_them = program.multiply_them
    # NOTE(review): global_size is hard-coded to 512**3 instead of being
    # derived from ksp.shape -- confirm the input is always that size.
    multiply_them(data_dev, data_dev, filter_dev, global_size=512 * 512 * 512)
    thr.synchronize()
    del filter_dev, program
    #api = cluda.ocl_api()
    #api = any_api()
    #thr = api.Thread.create()
    # Filter
    # data_dev = thr.to_device(ksp)
    # ifft = FFT(data_dev)
    # FACTOR is not used anywhere else in this function.
    FACTOR = 1.0

    # Recon
    # thr.synchronize()
    #data_dev = thr.to_device(ksp)
    ifft = FFT(data_dev)
    cifft = ifft.compile(thr)
    fftshiftobj = FFTShift(data_dev)
    cfftshift = fftshiftobj.compile(thr)
    # Despite the ``ifft`` naming this is called with inverse=0.
    cifft(data_dev, data_dev, inverse=0)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    thr.synchronize()
    # Normalise by the element count, then reverse every axis.
    result2 = data_dev.get() / np.prod(np.array(ksp.shape))
    result2 = result2[::-1, ::-1, ::-1]
    thr.release()
    return result2

예제 #22

0

파일 보기

파일: profile_reikna_opencl.py 프로젝트: maeager/Agilent2Dicom

def fourierepanechnikov(siz, sigma):
    """
    Build a Fourier-domain Epanechnikov filter of shape *siz* on the GPU.

    Author's note: the Epanechnikov kernel in the Fourier domain is
     A.(1-|x|^2)  => (3/2*w^3)(sin(w) - w*cos(w)/2)

    As implemented, the image-domain kernel from
    ``cplxfilter.epanechnikov_kernel`` is placed near the centre of a zero
    array of shape *siz*, transformed with FFT + fftshift on the GPU, and
    the magnitude of the result divided by sqrt(2) is stored in both the
    real and the imaginary part of the returned complex64 array.

    This is Python 2 code (``print`` statements, ``xrange``).

    :param siz: shape of the filter to build (three dimensions are indexed).
    :param sigma: scalar width, or a sequence of three widths.
    """

    # (uu, vv, ww) = fouriercoords(siz)
    # uu = uu + np.spacing(1)
    # vv = vv + np.spacing(1)
    # ww = ww + np.spacing(1)

    # if not hasattr(sigma, "__len__"):
    # #if type(sigma) is float or type(sigma) is numpy.float64:
    #     return ((3.0*sigma/16.0)/(np.pi*(uu + vv +
    #     ww)/(sigma))**3)*(np.sin(2*np.pi*(uu + vv + ww)/(sigma)) - np.pi*(uu
    #     + vv + ww)/(sigma)*np.cos(2*np.pi*(uu + vv + ww)/(sigma))/2)
    # else:
    # return ((3.0/16.0)/(np.pi*((uu**3)/sigma[0]**4 + (vv**3)/sigma[1]**4 +
    # (ww**3)/sigma[2]**4)))*(np.sin(2*np.pi*(uu/sigma[0] + vv/sigma[1] +
    # ww/sigma[2])) - np.pi*(uu/sigma[0] + vv/sigma[1] +
    # ww/sigma[2])*np.cos(2*np.pi*(uu/sigma[0] + vv/sigma[1] + ww/sigma[2])))

    def is_odd(num):
        # Non-zero when the lowest bit of num is set.
        return num & 0x1

    from cplxfilter import epanechnikov_kernel
    if not hasattr(sigma, "__len__"):
        # Scalar sigma: cubic kernel of side ceil(sigma) + 1.
        Kepa = epanechnikov_kernel(
            (np.ceil(sigma) + 1, np.ceil(sigma) + 1, np.ceil(sigma) + 1),
            sigma)
    else:
        print(
            np.ceil(sigma[0]) + 2,
            np.ceil(sigma[1]) + 2,
            np.ceil(sigma[2]) + 2)
        print sigma
        # NOTE(review): fsiz is adjusted here (odd sizes are incremented)
        # but is not passed to epanechnikov_kernel below -- confirm whether
        # it was meant to be used.
        fsiz = (np.ceil(sigma) + 2).astype(int)
        for i in xrange(0, fsiz.size):
            if is_odd(fsiz[i]):
                fsiz[i] += 1

        Kepa = epanechnikov_kernel(
            (np.ceil(sigma[0]) + 2, np.ceil(sigma[1]) + 2,
             np.ceil(sigma[2]) + 2), sigma)

    Kfilter = np.zeros(np.array(siz), dtype=np.complex64)
    # NOTE(review): szmin/szmax come from np.floor and are therefore
    # floats; recent NumPy versions reject float slice bounds -- confirm
    # the NumPy version this targets.
    szmin = np.floor(
        np.array(siz) / 2.0 - np.floor(np.array(Kepa.shape) / 2.0) - 1)
    szmax = np.floor(szmin + np.array(Kepa.shape))
    print "Epa filter size ", siz, " image filter ", Kepa.shape, " szmin ", szmin, " szmax ", szmax
    # The kernel is written to both the real and the imaginary part.
    Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1], szmin[2]:szmax[2]] = Kepa
    Kfilter[szmin[0]:szmax[0], szmin[1]:szmax[1],
            szmin[2]:szmax[2]].imag = Kepa

    # return np.abs(fftshift(clfftn(Kfilter)))

    # In-place FFT followed by in-place fftshift on the device.
    api = any_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(Kfilter)
    fft = FFT(data_dev)
    cfft = fft.compile(thr)
    fftshift = FFTShift(data_dev)
    cfftshift = fftshift.compile(thr)
    cfft(data_dev, data_dev)
    thr.synchronize()
    cfftshift(data_dev, data_dev)
    thr.synchronize()

    result2 = data_dev.get()  # / np.prod(np.array(ksp.shape))
    #result2 = result2[::-1,::-1,::-1]
    thr.release()
    # Same magnitude / sqrt(2) in both the real and the imaginary part.
    result = np.zeros(np.array(siz), dtype=np.complex64)
    result.real = np.abs(result2) / np.sqrt(2)
    result.imag = np.abs(result2) / np.sqrt(2)
    return result

예제 #23

0

파일 보기

파일: profile_reikna_opencl.py 프로젝트: maeager/Agilent2Dicom

def kspacegaussian_filter_CL2(ksp, sigma):
    """Apply a k-space Gaussian filter and reconstruct on the GPU.

    Steps performed:

    1. GPU initialisation (any available reikna API).
    2. Push the k-space matrix to the GPU.
    3. Compile the Gaussian weighting kernel.
    4. Execute the kernel in place and synchronise.
    5. Execute the FFT (called with ``inverse=0``).
    6. Execute FFTShift.
    7. Retrieve the result and divide it by the number of elements.
    8. Reverse all three axes (mimic numpy format).

    Args:
        ksp: 3-D k-space array.  The kernel declares its buffers as
            complex64 -- assumes ``ksp`` already has that dtype; TODO confirm
            against callers.
        sigma: Indexable holding the three per-axis Gaussian widths
            (``sigma[0]``, ``sigma[1]``, ``sigma[2]``).

    Returns:
        The filtered, transformed and axis-reversed array fetched from the
        device.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    ultype = np.uint64
    api = any_api()
    thr = api.Thread.create()
    # Release the thread even when compilation or execution fails.
    try:
        data_dev = thr.to_device(ksp)
        FACTOR = 1.0
        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ${ultype} x = (${ultype})get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ${ultype} idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      // ">=" so that a row index equal to dim2 also wraps back to 0
      if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;

}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
                              render_kwds=dict(ctype=dtypes.ctype(dtype),
                                               ftype=dtypes.ctype(ftype),
                                               ultype=dtypes.ctype(ultype),
                                               exp=functions.exp(ftype)),
                              fast_math=True)
        gauss_kernel = program.gauss_kernel
        # One work item per element; the kernel weights the data in place.
        gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
        thr.synchronize()

        # Recon
        cifft = FFT(data_dev).compile(thr)
        cfftshift = FFTShift(data_dev).compile(thr)
        cifft(data_dev, data_dev, inverse=0)
        thr.synchronize()
        cfftshift(data_dev, data_dev)
        thr.synchronize()
        result2 = data_dev.get() / np.prod(np.array(ksp.shape))
    finally:
        thr.release()
    return result2[::-1, ::-1, ::-1]

예제 #24

0

파일 보기

파일: profile_reikna_opencl.py 프로젝트: maeager/Agilent2Dicom

# Excerpt of a profiling script: times a reikna FFT of `ksp` on the GPU.
# `simpleifft`, `tic`, `toc`, `ksp` and `np` are defined outside this excerpt.
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()

from reikna.cluda import dtypes, any_api
from reikna.fft import FFT
from reikna.core import Annotation, Type, Transformation, Parameter
# create two timers so we can speed-test each approach

api = any_api()
thr = api.Thread.create()
# Used below as N**3 to normalise the transform; assumes `ksp` holds
# N*N*N elements -- TODO confirm.
N = 512

# Timed section: upload, build + compile the FFT, run it in place, wait.
tic()
data_dev = thr.to_device(ksp)
ifft = FFT(data_dev)
cifft = ifft.compile(thr)
cifft(data_dev, data_dev, inverse=0)
thr.synchronize()
toc()
result = np.fft.fftshift(data_dev.get() / N**3)
# Reversing every axis and then rolling each by one maps index k to
# (-k) mod size along that axis.
# NOTE(review): presumably this turns the forward transform (inverse=0)
# into the inverse one -- confirm.
result = result[::-1, ::-1, ::-1]
result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0)
print "Reikna IFFT time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))
# Free the GPU thread and drop every reference to it before the next run.
thr.release()
del ifft, cifft, data_dev, thr

# Second run on a fresh thread (the excerpt ends before it completes).
thr = api.Thread.create()
tic()
data_dev = thr.to_device(ksp)
ifft = FFT(data_dev)

예제 #25

0

파일 보기

    print 'PyFFT error: ', np.sum(abs(cpuSol.real - solPyFFT.real)), np.sum(
        abs(cpuSol.imag - solPyFFT.imag))
    print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n'
    #print np.sum(cpuSol.real)

    imem = getFreeMemory(show=False)
    setZero(aux_gpu, block=block3d, grid=grid3d)
    setZero(aux2_gpu, block=block3d, grid=grid3d)
    myplan1 = plan2(aux_gpu.shape, aux_gpu.dtype, aux_gpu.dtype)
    gpuMesureTime(solBySci, ntimes=100)
    solSci = aux2_gpu.get() / float(cpuSol.size)
    print 'SciKits error: ', np.sum(abs(cpuSol.real - solSci.real)), np.sum(
        abs(cpuSol.imag - solSci.imag))
    print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n'

    imem = getFreeMemory(show=False)
    setZero(aux_gpu, block=block3d, grid=grid3d)
    setZero(aux2_gpu, block=block3d, grid=grid3d)
    api = cuda_api()
    thr = api.Thread(ctx)
    fftPlan3 = FFT(func_gpu)
    reikFFT = fftPlan3.compile(thr)
    gpuMesureTime(solByReik, ntimes=100)
    solReik = aux2_gpu.get()
    print 'Reikna error: ', np.sum(abs(cpuSol.real - solReik.real)), np.sum(
        abs(cpuSol.imag - solReik.imag))
    print 'Extra memory use:', imem - getFreeMemory(show=False), 'MB \n'

    #print np.sum(cpuSol.real),np.sum(abs(cpuSol.real))

    ctx.detach()

예제 #26

0

파일 보기

파일: gpu_cache.py 프로젝트: mramospe/minkit

 def build_object(self, arr):
     """Create an FFT computation for ``arr`` and compile it on ``self._thread``."""
     return FFT(arr).compile(self._thread)

예제 #27

0

파일 보기

파일: profile_reikna_opencl.py 프로젝트: maeager/Agilent2Dicom

def kspacegaussian_filter_CL(ksp, sigma):
    """Apply a k-space Gaussian filter and reconstruct on the GPU.

    The k-space data is uploaded, multiplied in place by a Gaussian weight
    in an OpenCL/CUDA kernel, and transformed with a reikna FFT (called with
    ``inverse=0``).  The result is divided by the total number of elements,
    fft-shifted on the host, and finally every axis is reversed and rolled
    by one.

    Args:
        ksp: 3-D k-space array.  The kernel declares its buffers as
            complex64 -- assumes ``ksp`` already has that dtype; TODO confirm
            against callers.
        sigma: Indexable holding the three per-axis Gaussian widths
            (``sigma[0]``, ``sigma[1]``, ``sigma[2]``).

    Returns:
        The filtered and reconstructed array as a host numpy array.
    """
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    api = any_api()
    thr = api.Thread.create()
    # Release the thread even when compilation or execution fails.
    try:
        data_dev = thr.to_device(ksp)
        FACTOR = 1.0
        program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ${ultype} x = (${ultype})get_global_id(0);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ${ultype} idx = x;
  ${ftype} i = (${ftype})((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0f))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(x / dim3);
      // ">=" so that a row index equal to dim2 also wraps back to 0
      if((SIZE_T)j >= dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  // Account for large global index (stored as ulong) before performing modulus
  double pre_k=fmod((double)(x), (double)dim3);
  ${ftype} k = (${ftype}) pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  // ${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  // ${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight;

}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
                              render_kwds=dict(ctype=dtypes.ctype(dtype),
                                               ftype=dtypes.ctype(ftype),
                                               ultype=dtypes.ctype(np.uint64),
                                               exp=functions.exp(ftype)),
                              fast_math=True)
        gauss_kernel = program.gauss_kernel
        # One work item per element; the kernel weights the data in place.
        gauss_kernel(data_dev, data_dev, global_size=sz[0] * sz[1] * sz[2])
        thr.synchronize()

        cifft = FFT(data_dev).compile(thr)
        cifft(data_dev, data_dev, inverse=0)
        # Parenthesised: the transform is divided by the full element count,
        # not divided by sz[0] and then multiplied by sz[1] * sz[2].
        normalised = data_dev.get() / (sz[0] * sz[1] * sz[2])
    finally:
        thr.release()

    result = np.fft.fftshift(normalised)
    # Reversing each axis and rolling it by one maps index k to (-k) mod size.
    result = result[::-1, ::-1, ::-1]
    result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0)
    return result

예제 #28

0

파일 보기

파일: demo_fftshift_transformation.py 프로젝트: fjarri/reikna

def run_test(thr, shape, dtype, axes=None):
    """Time FFT followed by FFT shift on the GPU and with numpy.

    Random data of the given ``shape`` and ``dtype`` is transformed three
    ways on the GPU (FFT alone, shift alone, both queued back to back) and
    once more with the shift attached to the FFT output as a transformation.
    The same steps are timed with numpy, and both GPU results are checked
    against the numpy reference with ``numpy.allclose``.

    Returns:
        A dict of elapsed wall-clock seconds with the keys ``t_gpu_fft``,
        ``t_gpu_shift``, ``t_gpu_separate``, ``t_gpu_combined``,
        ``t_cpu_fft``, ``t_cpu_shift`` and ``t_cpu_all``.
    """
    host_data = numpy.random.normal(size=shape).astype(dtype)

    fft = FFT(host_data, axes=axes)
    fft_compiled = fft.compile(thr)
    shift_compiled = FFTShift(host_data, axes=axes).compile(thr)

    timings = {}

    def time_gpu(label, *calls):
        # Launch every (computation, arguments) pair, then wait for the device.
        began = time.time()
        for computation, arguments in calls:
            computation(*arguments)
        thr.synchronize()
        timings[label] = time.time() - began

    def time_cpu(label, func):
        # Run ``func`` on the host, record the elapsed time, pass its value on.
        began = time.time()
        value = func()
        timings[label] = time.time() - began
        return value

    # FFT and shift timed individually, in place on the same device buffer.
    buf = thr.to_device(host_data)
    time_gpu('t_gpu_fft', (fft_compiled, (buf, buf)))
    time_gpu('t_gpu_shift', (shift_compiled, (buf, buf)))

    # Fresh upload; both computations are queued before a single synchronize.
    buf = thr.to_device(host_data)
    time_gpu(
        't_gpu_separate',
        (fft_compiled, (buf, buf)),
        (shift_compiled, (buf, buf)))
    gpu_separate = buf.get()

    # FFT with the shift attached as an output transformation.
    buf = thr.to_device(host_data)
    # a separate output array to avoid unsafety of the shift transformation
    out_buf = thr.empty_like(buf)
    shift_tr = fftshift(host_data, axes=axes)
    fused = fft.parameter.output.connect(
        shift_tr, shift_tr.input, new_output=shift_tr.output)
    fused_compiled = fused.compile(thr)
    time_gpu('t_gpu_combined', (fused_compiled, (out_buf, buf)))

    # Reference calculation with numpy.
    time_cpu('t_cpu_fft', lambda: numpy.fft.fftn(host_data, axes=axes))
    time_cpu('t_cpu_shift', lambda: numpy.fft.fftshift(host_data, axes=axes))
    reference = time_cpu(
        't_cpu_all',
        lambda: numpy.fft.fftshift(
            numpy.fft.fftn(host_data, axes=axes), axes=axes))

    gpu_combined = out_buf.get()

    # Checking that the results are correct
    # (note: this will require relaxing the tolerances
    # if complex64 is used instead of complex128)
    assert numpy.allclose(reference, gpu_separate)
    assert numpy.allclose(reference, gpu_combined)

    return timings

예제 #29

0

파일 보기

        [Parameter('output', Annotation(Type(complex_dtype, arr.shape), 'o')),
        Parameter('input', Annotation(arr, 'i'))],
        """
        ${output.store_same}(
            COMPLEX_CTR(${output.ctype})(
                ${input.load_same},
                0));
        """)


# Demo: feed a real float32 array into a complex FFT through a transformation.
# `get_complex_trf` and `thr` are defined outside this excerpt.
arr = numpy.random.normal(size=3000).astype(numpy.float32)

# Transformation whose output is complex and whose input is `arr`'s type.
trf = get_complex_trf(arr)


# Create the FFT computation and attach the transformation above to its input.
fft = FFT(trf.output) # (A shortcut: using the array type saved in the transformation)
fft.parameter.input.connect(trf, trf.output, new_input=trf.input)
cfft = fft.compile(thr)


# Run the computation
arr_dev = thr.to_device(arr)
# The output buffer is complex64 while the uploaded input stays float32.
res_dev = thr.array(arr.shape, numpy.complex64)
cfft(res_dev, arr_dev)
result = res_dev.get()

# Host-side reference transform of the same real data.
reference = numpy.fft.fft(arr)

# Relative error (ratio of norms) must stay below 1e-6.
assert numpy.linalg.norm(result - reference) / numpy.linalg.norm(reference) < 1e-6

예제 #30

0

파일 보기

파일: factories.py 프로젝트: xialulee/WaveSyn

 def create(thr, size, dtype=np.complex128, axes=None, compile_=True):
     """Build an FFT computation for an array of ``size`` and ``dtype``.

     An array allocated with ``thr.array(size, dtype=dtype)`` is handed to
     ``FFT`` together with ``axes``.  The computation is compiled on ``thr``
     when ``compile_`` is truthy; otherwise the uncompiled object is returned.
     """
     template = thr.array(size, dtype=dtype)
     computation = FFT(template, axes)
     if not compile_:
         return computation
     return computation.compile(thr)

예제 #31

0

파일 보기

if __name__ == '__main__':

    api = any_api()
    thr = api.Thread.create()

    dtype = numpy.complex128

    shape = (1024, 16, 16, 16)
    axes = (1, 2, 3)

    data = numpy.random.normal(size=shape) + 1j * numpy.random.normal(size=shape)
    data = data.astype(dtype)

    fft = FFT(data, axes=axes)
    fftc = fft.compile(thr)

    fft2 = FFTWithTranspose(data, axes=axes)
    fft2c = fft2.compile(thr)

    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data_dev)

    for comp, tag in [(fftc, "original FFT"), (fft2c, "transposition-based FFT")]:
        attempts = 10
        ts = []
        for i in range(attempts):
            t1 = time.time()
            comp(res_dev, data_dev)
            thr.synchronize()
            t2 = time.time()