Пример #1
0
    def test_2d_real_to_complex_double(self, ctx):
        """Check a double-precision 2D real-to-complex FFT against NumPy.

        A (64, 32) float64 array is transformed out of place into a
        (64, 32 // 2 + 1) complex128 array and compared with
        ``np.fft.rfft2``.  The test returns early (silently passing) when
        ``has_double(ctx)`` is false.
        """
        if not has_double(ctx):  # TODO: find better way to skip test
            return
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M * N, dtype=np.float64)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1),
                                        dtype=np.complex128)

        transform = FFT(
            ctx,
            queue,
            cl_data,
            cl_data_transformed,
            axes=(1, 0),
        )

        transform.enqueue()

        # Copy the result back and build the reference exactly once; both are
        # reused for the diagnostic output and the comparison.  ``.get`` must
        # be *called* -- printing the attribute only shows a bound method.
        result = cl_data_transformed.get()
        expected = np.fft.rfft2(nd_data)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-8, atol=1e-8)
Пример #2
0
    def test_2d_in_4d_out_of_place(self, ctx):
        """Check batched out-of-place 2D FFTs inside a 4D array against NumPy.

        A (4, 5, 64, 32) complex64 array is transformed along ``axes`` into a
        second array of the same shape and compared with ``np.fft.fft2``
        evaluated over the same axes.
        """
        queue = cl.CommandQueue(ctx)

        L1 = 4
        L2 = 5

        M = 64
        N = 32
        axes = (-1, -2)  # ok
        # axes = (0, 1)  # ok
        # axes = (0, 2)  # cannot be collapsed

        nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
        nd_data.shape = (L1, L2, M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros_like(cl_data)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes=axes,
                        )

        transform.enqueue()

        # Fetch the device result once (calling ``.get()``, not printing the
        # bound method) and print the very reference that is asserted on,
        # i.e. the one computed with the same ``axes``.
        result = cl_data_transformed.get()
        expected = np.fft.fft2(nd_data, axes=axes)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
Пример #3
0
    def test_2d_in_4d_out_of_place(self, ctx):
        """Verify out-of-place 2D transforms batched over a 4D complex64 array.

        The input has shape (4, 5, 64, 32); the transform runs along ``axes``
        and the device output is compared with ``np.fft.fft2`` using the same
        ``axes`` argument.
        """
        queue = cl.CommandQueue(ctx)

        L1 = 4
        L2 = 5

        M = 64
        N = 32
        axes = (-1, -2)  # ok
        # axes = (0, 1)  # ok
        # axes = (0, 2)  # cannot be collapsed

        nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
        nd_data.shape = (L1, L2, M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros_like(cl_data)

        transform = FFT(
            ctx,
            queue,
            cl_data,
            cl_data_transformed,
            axes=axes,
        )

        transform.enqueue()

        # ``get`` has to be called to obtain the host copy; printing the bare
        # attribute would only show the method object.  The reference is
        # computed once, with the axes actually under test.
        actual = cl_data_transformed.get()
        reference = np.fft.fft2(nd_data, axes=axes)
        print(actual)
        print(reference)

        assert np.allclose(actual, reference, rtol=1e-3, atol=1e-3)
Пример #4
0
    def test_2d_real_to_complex_double(self, ctx):
        """Compare a float64 -> complex128 2D FFT with ``np.fft.rfft2``.

        Input is a (64, 32) float64 ramp; the out-of-place result has shape
        (64, 32 // 2 + 1).  When ``has_double(ctx)`` is false the test
        returns immediately without checking anything.
        """
        if not has_double(ctx):  # TODO: find better way to skip test
            return
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M * N, dtype=np.float64)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1),
                                        dtype=np.complex128)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes=(1, 0),
                        )

        transform.enqueue()

        # Call ``.get()`` so the transformed values (not the bound method)
        # are printed, and reuse the same host copy for the assertion.
        actual = cl_data_transformed.get()
        reference = np.fft.rfft2(nd_data)
        print(actual)
        print(reference)

        assert np.allclose(actual, reference, rtol=1e-8, atol=1e-8)
Пример #5
0
    def __init__(self, decomp, context, queue, grid_shape, dtype):
        """Allocate the transform buffers and build forward/backward FFT plans.

        :arg decomp: Stored unchanged as ``self.decomp``.
        :arg context: Passed to :class:`gpyfft.FFT` for both plans.
        :arg queue: Queue used for the array allocations, both plans and
            ``get_sliced_momenta``.
        :arg grid_shape: Shape of the position-space array ``self.fx``.
        :arg dtype: Datatype of ``self.fx``; a dtype of kind ``"f"`` marks
            the transform as real.
        """
        from pystella.fourier import (get_complex_dtype_with_matching_prec,
                                      get_real_dtype_with_matching_prec)

        self.decomp = decomp
        self.grid_shape = grid_shape
        self.dtype = np.dtype(dtype)
        self.is_real = self.dtype.kind == "f"

        # complex / real dtypes derived from the position-space dtype
        self.cdtype = get_complex_dtype_with_matching_prec(self.dtype)
        self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

        # position-space (fx) and momentum-space (fk) work arrays
        self.fx = cla.zeros(queue, grid_shape, dtype)
        self.fk = cla.zeros(queue, self.shape(self.is_real), self.cdtype)

        from gpyfft import FFT
        # both plans share the same options; only input/output are swapped
        plan_options = dict(real=self.is_real,
                            scale_forward=1,
                            scale_backward=1)
        self.forward = FFT(context, queue, self.fx, out_array=self.fk,
                           **plan_options)
        self.backward = FFT(context, queue, self.fk, out_array=self.fx,
                            **plan_options)

        # one empty slice specification per axis
        no_slicing = ((), (), ())
        self.sub_k = get_sliced_momenta(grid_shape, self.dtype, no_slicing,
                                        queue)
Пример #6
0
    def test_2d_out_of_place(self, ctx):
        """Check batched out-of-place 2D FFTs of a 3D array against NumPy.

        A (4, 64, 32) complex64 array is transformed along ``axes`` into a
        second array of the same shape and compared with ``np.fft.fft2``
        over the same axes.
        """
        queue = cl.CommandQueue(ctx)

        L = 4
        M = 64
        N = 32
        axes = (-1, -2)

        nd_data = np.arange(L * M * N, dtype=np.complex64)
        nd_data.shape = (L, M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros_like(cl_data)

        transform = FFT(
            ctx,
            queue,
            cl_data,
            cl_data_transformed,
            axes=axes,
        )

        transform.enqueue()

        # Print the transformed array itself (``.get()`` must be called) and
        # the same reference that the assertion uses.
        result = cl_data_transformed.get()
        expected = np.fft.fft2(nd_data, axes=axes)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
Пример #7
0
    def test_2d_real_to_complex(self, ctx):
        """Check a single-precision 2D real-to-complex FFT against NumPy.

        A (64, 32) float32 array is transformed out of place into a
        (64, 32 // 2 + 1) complex64 array and compared with
        ``np.fft.rfft2``.
        """
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M * N, dtype=np.float32)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1),
                                        dtype=np.complex64)

        transform = FFT(
            ctx,
            queue,
            cl_data,
            cl_data_transformed,
            axes=(1, 0),
        )

        transform.enqueue()

        # One host copy and one reference, shared by the diagnostic prints
        # and the assertion.  ``.get`` is called so that the data, rather
        # than the bound method, is printed.
        result = cl_data_transformed.get()
        expected = np.fft.rfft2(nd_data)
        print(result)
        print(expected)

        assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
Пример #8
0
    def test_2d_real_to_complex(self, ctx):
        """Compare a float32 -> complex64 2D FFT with ``np.fft.rfft2``.

        The (64, 32) real input is transformed out of place; the complex
        output has shape (64, 32 // 2 + 1).
        """
        queue = cl.CommandQueue(ctx)

        M = 64
        N = 32

        nd_data = np.arange(M * N, dtype=np.float32)
        nd_data.shape = (M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros(queue, (M, N // 2 + 1),
                                        dtype=np.complex64)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes=(1, 0),
                        )

        transform.enqueue()

        # ``.get()`` (with the call) yields the host array; the bare
        # attribute would just print a method object.
        actual = cl_data_transformed.get()
        reference = np.fft.rfft2(nd_data)
        print(actual)
        print(reference)

        assert np.allclose(actual, reference, rtol=1e-3, atol=1e-3)
Пример #9
0
    def test_2d_out_of_place(self, ctx):
        """Verify out-of-place 2D transforms batched over a 3D complex64 array.

        The input has shape (4, 64, 32); the transform runs along ``axes``
        and the result is compared with ``np.fft.fft2`` using the same
        ``axes``.
        """
        queue = cl.CommandQueue(ctx)

        L = 4
        M = 64
        N = 32
        axes = (-1, -2)

        nd_data = np.arange(L * M * N, dtype=np.complex64)
        nd_data.shape = (L, M, N)
        cl_data = cla.to_device(queue, nd_data)

        cl_data_transformed = cla.zeros_like(cl_data)

        transform = FFT(ctx, queue,
                        cl_data,
                        cl_data_transformed,
                        axes=axes,
                        )

        transform.enqueue()

        # Call ``.get()`` to print actual values, and print the reference
        # computed with the axes under test so that the output matches what
        # is asserted.
        actual = cl_data_transformed.get()
        reference = np.fft.fft2(nd_data, axes=axes)
        print(actual)
        print(reference)

        assert np.allclose(actual, reference, rtol=1e-3, atol=1e-3)
Пример #10
0
    def test_1d_out_of_place(self, ctx):
        """Out-of-place 1D FFT of 32 complex64 samples must match ``np.fft.fft``."""
        queue = cl.CommandQueue(ctx)

        host_input = np.arange(32, dtype=np.complex64)
        device_input = cla.to_device(queue, host_input)
        device_output = cla.zeros_like(device_input)

        # keep a reference to the plan object while the transform is in use
        transform = FFT(ctx, queue, device_input, device_output)
        transform.enqueue()

        actual = device_output.get()
        expected = np.fft.fft(host_input)
        assert np.allclose(actual, expected)
Пример #11
0
    def test_1d_inplace_double(self, ctx):
        """In-place 1D FFT of 32 complex128 samples must match ``np.fft.fft``.

        Returns early, without checking anything, when ``has_double(ctx)``
        is false.
        """
        if not has_double(ctx):  # TODO: find better way to skip test
            return

        queue = cl.CommandQueue(ctx)
        host_input = np.arange(32, dtype=np.complex128)
        device_data = cla.to_device(queue, host_input)

        # no output array is given: the transform overwrites ``device_data``
        transform = FFT(ctx, queue, device_data)
        transform.enqueue()

        actual = device_data.get()
        expected = np.fft.fft(host_input)
        assert np.allclose(actual, expected)
Пример #12
0
    def test_1d_inplace_double(self, ctx):
        """Check an in-place complex128 1D FFT (32 points) against NumPy.

        The check is skipped by returning early when ``has_double(ctx)`` is
        false.
        """
        if not has_double(ctx):  # TODO: find better way to skip test
            return
        queue = cl.CommandQueue(ctx)

        samples = np.arange(32, dtype=np.complex128)
        cl_samples = cla.to_device(queue, samples)

        # only one array is passed, so input and output share the buffer
        plan = FFT(ctx, queue, cl_samples)
        plan.enqueue()

        transformed = cl_samples.get()
        assert np.allclose(transformed, np.fft.fft(samples))
Пример #13
0
    def test_1d_out_of_place(self, ctx):
        """Check an out-of-place complex64 1D FFT (32 points) against NumPy."""
        queue = cl.CommandQueue(ctx)

        samples = np.arange(32, dtype=np.complex64)
        cl_samples = cla.to_device(queue, samples)
        cl_spectrum = cla.zeros_like(cl_samples)

        # separate input and output arrays -> out-of-place transform
        plan = FFT(ctx, queue, cl_samples, cl_spectrum)
        plan.enqueue()

        spectrum = cl_spectrum.get()
        assert np.allclose(spectrum, np.fft.fft(samples))
Пример #14
0
    def test_1d_real_to_complex(self, ctx):
        """1D real-to-complex FFT of 32 float32 samples must match ``np.fft.rfft``.

        The complex64 output array holds ``32 // 2 + 1`` coefficients.
        """
        queue = cl.CommandQueue(ctx)

        N = 32
        host_input = np.arange(N, dtype=np.float32)
        device_input = cla.to_device(queue, host_input)

        output_shape = (N // 2 + 1,)
        device_output = cla.zeros(queue, output_shape, dtype=np.complex64)

        transform = FFT(ctx, queue, device_input, device_output)
        transform.enqueue()

        actual = device_output.get()
        expected = np.fft.rfft(host_input)
        assert np.allclose(actual, expected)
Пример #15
0
    def test_1d_real_to_complex(self, ctx):
        """Check a float32 -> complex64 1D FFT (32 points) against NumPy.

        The output array is allocated with ``N // 2 + 1`` entries, the
        length ``np.fft.rfft`` returns for an even ``N``.
        """
        queue = cl.CommandQueue(ctx)

        N = 32

        samples = np.arange(N, dtype=np.float32)
        cl_samples = cla.to_device(queue, samples)
        n_coefficients = N // 2 + 1
        cl_spectrum = cla.zeros(queue, (n_coefficients,), dtype=np.complex64)

        plan = FFT(ctx, queue, cl_samples, cl_spectrum)
        plan.enqueue()

        spectrum = cl_spectrum.get()
        assert np.allclose(spectrum, np.fft.rfft(samples))
Пример #16
0
def run(double_precision=False):
    """Time gpyfft transforms of a 1024x1024 array and check them against NumPy.

    Two sections are executed and reported on stdout:

    * out-of-place transforms for every combination of axes in ``axes_list``,
      C/Fortran-ordered input and C/Fortran-ordered output; each result is
      compared with ``npfftn`` and the maximum absolute error is printed;
    * in-place transforms for every combination of axes and C/Fortran-ordered
      input (timing only, no comparison).

    Any exception raised while handling one out-of-place combination is
    printed and the benchmark continues with the next combination.

    :arg double_precision: If true use complex128 data and a tolerance of
        1e-8, otherwise complex64 data and a tolerance of 1e-3.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex64 if not double_precision else np.complex128
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for testing for correct result

    if n_run > 1:
        nd_dataC = np.random.normal(size=(1024, 1024)).astype(dtype)
    else:
        nd_dataC = np.ones((1024, 1024), dtype=dtype)  # set n_run to 1

    nd_dataF = np.asfortranarray(nd_dataC)
    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)

    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(-2, -1), (-1, -2), None]  # batched 2d transforms

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes         in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                # defaults reported by the ``finally`` clause when the
                # combination fails before the timing is complete
                t_ms, gflops = 0, 0
                try:
                    transform = FFT(context,
                                    queue,
                                    data,
                                    result,
                                    axes=axes)
                    # transform.plan.transpose_result = True
                    # (not implemented for some transforms; works e.g. for
                    # out of place, (2,1) C C)
                    print(
                        '%-10s %3s %3s' % (
                            axes,
                            'C' if data.flags.c_contiguous else 'F',
                            'C' if result.flags.c_contiguous else 'F',
                        ),
                        end=' ',
                    )

                    tic = timeit.default_timer()
                    for _ in range(n_run):
                        events = transform.enqueue()
                        # events = transform.enqueue(False)
                    # only the events of the last enqueue are waited on
                    for event in events:
                        event.wait()
                    toc = timeit.default_timer()
                    t_ms = 1e3 * (toc - tic) / n_run
                    n_points = np.prod(transform.t_shape)
                    gflops = (5e-9 * np.log2(n_points) * n_points
                              * transform.batchsize / (1e-3 * t_ms))

                    npfft_result = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        # NOTE(review): with ``axes=None`` this indexing
                        # raises TypeError, which the generic handler below
                        # prints -- confirm whether that case can occur.
                        npfft_result = np.swapaxes(npfft_result, axes[0],
                                                   axes[1])

                    # single device-to-host copy, shared by the error report
                    # and the comparison
                    host_result = result.get()
                    max_error = np.max(abs(host_result - npfft_result))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(host_result,
                                    npfft_result,
                                    atol=tolerance,
                                    rtol=tolerance)

                except GpyFFT_Error as err:
                    print(err)
                except AssertionError as err:
                    print(err)
                except Exception as err:
                    # deliberate best effort: report and keep benchmarking
                    print(err)
                finally:
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)

    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            # transform.plan.transpose_result = True  # not implemented
            tic = timeit.default_timer()
            for _ in range(n_run):  # inplace transform fails for n_run > 1
                events = transform.enqueue()
            for event in events:
                event.wait()
            toc = timeit.default_timer()
            t_ms = 1e3 * (toc - tic) / n_run
            n_points = np.prod(transform.t_shape)
            gflops = (5e-9 * np.log2(n_points) * n_points
                      * transform.batchsize / (1e-3 * t_ms))
            print(
                '%-10s %3s %5.2fms %6.2f Gflops' %
                (axes, 'C' if data.flags.c_contiguous else 'F', t_ms, gflops))
Пример #17
0
    def create_workspace(self):
        """
        Allocate the memory pool, the host/device buffers and the FFT plans.

        All device arrays are drawn from one memory pool stored in
        ``self._mem_pool``.  The notes below sketch how the buffers are
        cycled through the accelerated iteration.

        init
        ... x_{k+1} = ref
        ... a_k = 0.

        iter 0
        ... h_k = None
            y_k = x_{k+1}
            y_k = (a_k + 1) * x_{k+1} - a_k * x_k
        ... t = f(x_k)
        ... g_k = t - y_k
            x_k = x_{k+1}
            x_{k+1} = y_k + g_k
        ... g_{k-2} = g_{k-1}
            g_{k-1} = g_k
        ... a_k = 0.

        iter 1
        ... h_k = x_{k+1} - x_k
            y_k = x_{k+1}
        ... t = f(x_{k+1})
        ... g_k = t - y_k
            x_k = x_{k+1}
            x_{k+1} = y_k + g_k
        ... g_{k-2} = g_{k-1}
            g_{k-1} = g_k
        ... a_k = L(g_{k-1}, g_{k-2})

        iter 2
        ... h_k = x_{k+1} - x_k
            y_k = x_{k+1} + a_k * h_k
        ... t = f(x_{k+1})
        ... g_k = t - y_k
            x_k = x_{k+1}
            x_{k+1} = y_k + g_k
        ... g_{k-2} = g_{k-1}
            g_{k-1} = g_k
        ... a_k = L(g_{k-1}, g_{k-2})

        iter n (y_k, x_{k+1}, x_k, g_{k+1}, g_k)
        ... y_k = (a_k + 1) * x_{k+1} - a_k * x_k
        ... t = f(x_{k+1})
        ... g_k = g_{k+1}
            g_{k+1} = t - y_k
        ... x_k = x_{k+1}
            x_{k+1} = y_k + g_{k+1}
        ... a_k = L(g_{k+1}, g_k)
        --> return x_{k+1}
        --> return t, bypass acceleration
        """
        # shapes: full real volume and its half-spectrum along the last axis
        nz, ny, nx = self._out_shape
        real_shape = (nz, ny, nx)
        complex_shape = (nz, ny, nx // 2 + 1)

        # memory pool shared by every device array created below
        self._mem_pool = cl.tools.MemoryPool(
            cl.tools.ImmediateAllocator(self.queue,
                                        mem_flags=cl.mem_flags.READ_WRITE))

        def pooled_empty(shape, dtype):
            # uninitialized device array on our queue, backed by the pool
            return cl.array.empty(self.queue,
                                  shape,
                                  dtype,
                                  allocator=self._mem_pool)

        #TODO wrap this section in ExitStack, callback(destroy_workspace)

        # reference image: host staging buffer and device copy
        self.h_buf = np.empty(real_shape, dtype=np.float32)
        self.d_ref = pooled_empty(real_shape, np.float32)

        # otf
        self.d_otf = pooled_empty(complex_shape, np.complex64)

        # deconvolution io buffers
        self.d_dec_bufs = AttrDict()
        self.d_dec_bufs['tmp'] = pooled_empty(real_shape, np.float32)
        self.d_dec_bufs['fft'] = pooled_empty(complex_shape, np.complex64)

        # forward plan: tmp -> fft
        self.fft = FFT(self.context,
                       self.queue,
                       self.d_dec_bufs.tmp,
                       out_array=self.d_dec_bufs.fft)
        fft_temp_size = format_byte_size(self.fft.plan.temp_array_size,
                                         binary=True)
        logger.debug("fft buffer size: {}".format(fft_temp_size))

        # inverse plan: fft -> tmp
        self.ifft = FFT(self.context,
                        self.queue,
                        self.d_dec_bufs.fft,
                        out_array=self.d_dec_bufs.tmp,
                        real=True)
        ifft_temp_size = format_byte_size(self.ifft.plan.temp_array_size,
                                          binary=True)
        logger.debug("ifft buffer size: {}".format(ifft_temp_size))

        # accelerator buffers, one real-valued volume per name
        self.d_acc_bufs = AttrDict()
        for name in ('y', 'x1', 'x0', 'g1', 'g0'):
            self.d_acc_bufs[name] = pooled_empty(real_shape, np.float32)

        pool = self._mem_pool
        logger.debug("held={}, active={}".format(pool.held_blocks,
                                                 pool.active_blocks))
Пример #18
0
class gDFT(BaseDFT):
    """
    Fast Fourier transforms computed with :mod:`clfft` through its
    :mod:`gpyfft` wrapper.

    See :class:`pystella.fourier.dft.BaseDFT`.

    :arg decomp: A :class:`pystella.DomainDecomposition`.

    :arg context: A :class:`pyopencl.Context`.

    :arg queue: A :class:`pyopencl.CommandQueue`.

    :arg grid_shape: A 3-:class:`tuple` specifying the shape of position-space
        arrays to be transformed.

    :arg dtype: The datatype of position-space arrays to be transformed.
        The complex datatype for momentum-space arrays is chosen to have
        the same precision.

    .. versionchanged:: 2020.1

        Support for complex-to-complex transforms.
    """
    def __init__(self, decomp, context, queue, grid_shape, dtype):
        from pystella.fourier import (get_complex_dtype_with_matching_prec,
                                      get_real_dtype_with_matching_prec)

        self.decomp = decomp
        self.grid_shape = grid_shape
        self.dtype = np.dtype(dtype)
        # a floating-point (kind "f") input selects the real transform
        self.is_real = self.dtype.kind == "f"

        self.cdtype = get_complex_dtype_with_matching_prec(self.dtype)
        self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

        # position-space (fx) and momentum-space (fk) work arrays
        self.fx = cla.zeros(queue, grid_shape, dtype)
        self.fk = cla.zeros(queue, self.shape(self.is_real), self.cdtype)

        from gpyfft import FFT
        # the two plans differ only in which array is input and which output
        plan_options = dict(real=self.is_real,
                            scale_forward=1,
                            scale_backward=1)
        self.forward = FFT(context, queue, self.fx, out_array=self.fk,
                           **plan_options)
        self.backward = FFT(context, queue, self.fk, out_array=self.fx,
                            **plan_options)

        # one empty slice specification per axis
        no_slicing = ((), (), ())
        self.sub_k = get_sliced_momenta(grid_shape, self.dtype, no_slicing,
                                        queue)

    @property
    def proc_permutation(self):
        """The identity permutation ``(0, 1, ..., ndim - 1)`` of the grid axes."""
        n_axes = len(self.grid_shape)
        return tuple(range(n_axes))

    def shape(self, forward_output=True):
        """Return the array shape on one side of the transform.

        For a real transform with *forward_output* true, the last axis is
        reduced to ``n // 2 + 1`` entries and a tuple is returned; in every
        other case ``grid_shape`` is returned unchanged.
        """
        if not (forward_output and self.is_real):
            return self.grid_shape

        dims = tuple(self.grid_shape)
        return dims[:-1] + (dims[-1] // 2 + 1,)

    def forward_transform(self, fx, fk, **kwargs):
        """Enqueue the forward plan from *fx* into *fk* and return *fk*.

        The resulting event is attached to both arrays.  Extra keyword
        arguments are accepted and ignored.
        """
        (event,) = self.forward.enqueue_arrays(data=fx,
                                               result=fk,
                                               forward=True)
        for array in (fx, fk):
            array.add_event(event)
        return fk

    def backward_transform(self, fk, fx, **kwargs):
        """Enqueue the backward plan from *fk* into *fx* and return *fx*.

        The resulting event is attached to both arrays.  Extra keyword
        arguments are accepted and ignored.
        """
        (event,) = self.backward.enqueue_arrays(data=fk,
                                                result=fx,
                                                forward=False)
        for array in (fx, fk):
            array.add_event(event)
        return fx
Пример #19
0
def run(double_precision=False):
    """Time gpyfft transforms of a (4, 1024, 1024) array and check them.

    Two sections are executed and reported on stdout:

    * out-of-place batched 2D transforms for every combination of axes in
      ``axes_list``, C/Fortran-ordered input and C/Fortran-ordered output;
      each result is compared with ``npfftn`` and the maximum absolute error
      is printed;
    * in-place transforms for every combination of axes and C/Fortran-ordered
      input (timing only, no comparison).

    ``GpyFFT_Error`` and ``AssertionError`` raised for an out-of-place
    combination are printed and the benchmark continues; any other exception
    propagates after the timing line for that combination has been printed.

    :arg double_precision: If true use complex128 data and a tolerance of
        1e-8, otherwise complex64 data and a tolerance of 1e-3.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex64 if not double_precision else np.complex128
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for proper testing

    if n_run > 1:
        # faster than 1024x1024?
        nd_dataC = np.random.normal(size=(4, 1024, 1024)).astype(dtype)
    else:
        nd_dataC = np.ones((4, 1024, 1024), dtype=dtype)  # set n_run to 1

    nd_dataF = np.asfortranarray(nd_dataC)
    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)

    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(1, 2), (2, 1)]  # batched 2d transforms

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes         in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                # Reset per combination so that the ``finally`` report never
                # hits an unbound name (which would mask the real exception)
                # and never shows the previous combination's numbers.
                t_ms, gflops = 0, 0
                try:
                    transform = FFT(context, queue, data, result, axes=axes)
                    # transform.plan.transpose_result = True
                    # (not implemented for some transforms; works e.g. for
                    # out of place, (2,1) C C)
                    print(
                        '%-10s %3s %3s' % (
                            axes,
                            'C' if data.flags.c_contiguous else 'F',
                            'C' if result.flags.c_contiguous else 'F',
                        ),
                        end=' ',
                    )

                    tic = timeit.default_timer()
                    for _ in range(n_run):
                        events = transform.enqueue()
                        # events = transform.enqueue(False)
                    # only the events of the last enqueue are waited on
                    for event in events:
                        event.wait()
                    toc = timeit.default_timer()
                    t_ms = 1e3 * (toc - tic) / n_run
                    n_points = np.prod(transform.t_shape)
                    gflops = (5e-9 * np.log2(n_points) * n_points
                              * transform.batchsize / (1e-3 * t_ms))

                    npfft_result = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        npfft_result = np.swapaxes(npfft_result, axes[0],
                                                   axes[1])

                    # single device-to-host copy, shared by the error report
                    # and the comparison
                    host_result = result.get()
                    max_error = np.max(abs(host_result - npfft_result))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(host_result, npfft_result,
                                    atol=tolerance,
                                    rtol=tolerance)

                except GpyFFT_Error as err:
                    print(err)
                    t_ms, gflops = 0, 0
                except AssertionError as err:
                    print(err)
                finally:
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)

    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            # transform.plan.transpose_result = True  # not implemented
            tic = timeit.default_timer()
            for _ in range(n_run):  # inplace transform fails for n_run > 1
                events = transform.enqueue()
            for event in events:
                event.wait()
            toc = timeit.default_timer()
            t_ms = 1e3 * (toc - tic) / n_run
            n_points = np.prod(transform.t_shape)
            gflops = (5e-9 * np.log2(n_points) * n_points
                      * transform.batchsize / (1e-3 * t_ms))
            print('%-10s %3s %5.2fms %6.2f Gflops' % (
                axes,
                'C' if data.flags.c_contiguous else 'F',
                t_ms, gflops
            ))