def to_tex3d(data):
    """
    Source: https://wiki.tiker.net/PyCUDA/Examples/Demo3DSurface
    """
    d, h, w = data.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(data.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if isinstance(data, gpuarray.GPUArray):
        data = data.get()

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(data)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = data.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def create_3d_texture(a, module, variable, point_sampling=False):
    a = numpy.asfortranarray(a)
    w, h, d = a.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    out_texref = module.get_texref(variable)
    out_texref.set_array(ary)
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
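# Hedged usage sketch for create_3d_texture (not from the original source).
# The kernel, the texture name "vol_tex" and the array size are illustrative
# assumptions; the pattern mirrors the test_3d_texture example further below.
import numpy
import pycuda.autoinit  # noqa: F401  -- creates a context
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 3, cudaReadModeElementType> vol_tex;

__global__ void sample_volume(float *dest)
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    int i = (z * blockDim.y + y) * blockDim.x + x;
    dest[i] = tex3D(vol_tex, x, y, z);
}
""")

vol = numpy.random.rand(2, 3, 4).astype(numpy.float32)  # interpreted as (w, h, d)
texref = create_3d_texture(vol, mod, "vol_tex", point_sampling=True)

dest = numpy.zeros(vol.shape, dtype=numpy.float32, order="F")
mod.get_function("sample_volume")(cuda.Out(dest), block=vol.shape, texrefs=[texref])
assert numpy.allclose(dest, vol)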
def copyTexture(self, data, pixelSize=None, extent=None, isocenter=None):
    # Convert data to a float32 array in contiguous (C) ordering.
    self.arrIn = np.array(data, dtype=np.float32, order='C')
    d, h, w = self.arrIn.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(self.arrIn.dtype)
    descr.num_channels = 1
    # descr.flags = 0
    self.gpuTexture = cuda.Array(descr)

    # Copy the host data across into the 3D CUDA array on the GPU.
    copy = cuda.Memcpy3D()
    copy.set_src_host(self.arrIn)
    copy.set_dst_array(self.gpuTexture)
    copy.src_pitch = self.arrIn.strides[1]
    copy.width_in_bytes = self.arrIn.strides[1]
    copy.height = h
    copy.depth = d
    copy.src_height = h
    copy()

    self.pixelSize = pixelSize
    self.isocenter = isocenter
    # Extent [l, r, b, t, f, b]
    self.extent = extent
    # Trigger for setting bottom left corner as 0,0,0.
    self.zeroExtent = False
def numpy3d_to_array(np_array, allow_surface_bind=False, layered=True):
    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST
    if layered:
        # note: this assignment replaces SURFACE_LDST when both options are requested
        descr.flags = cuda.array3d_flags.ARRAY3D_LAYERED

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
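# Hedged sketch of how an array created with allow_surface_bind=True might be
# bound to a surface reference and written from a kernel. The kernel and the
# surface name "out_surf" are illustrative assumptions, not part of the source.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
surface<void, cudaSurfaceType3D> out_surf;

__global__ void fill()
{
    int x = threadIdx.x, y = threadIdx.y, z = threadIdx.z;
    surf3Dwrite(1.0f, out_surf, x * sizeof(float), y, z);
}
""")

data = np.zeros((4, 4, 4), dtype=np.float32)
dev_arr = numpy3d_to_array(data, allow_surface_bind=True, layered=False)
mod.get_surfref("out_surf").set_array(dev_arr)
mod.get_function("fill")(block=(4, 4, 4))  # dev_arr now holds ones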
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'):
    #import pycuda.autoinit
    d, h, w = gpuArray.shape

    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    if precision == 'float':
        descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype)
        descr3D.num_channels = 1
    elif precision == 'double':
        descr3D.format = cuda.array_format.SIGNED_INT32
        descr3D.num_channels = 2
    else:
        print("ERROR: CUDA_ARRAY incompatible precision")
        sys.exit()
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descr3D)

    copy3D = cuda.Memcpy3D()
    copy3D.set_src_device(gpuArray.ptr)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
def _copy_local_to_global(self, local_array, global_array, dtype=np.float64):
    nz, ny, nx = self.local_dims
    sw = self.stencil_width
    typesize = global_array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(local_array.gpudata)
    copier.set_dst_device(global_array.gpudata)

    # offsets
    copier.src_x_in_bytes = sw * typesize
    copier.src_y = sw
    copier.src_z = sw

    copier.src_pitch = local_array.strides[1]
    copier.dst_pitch = global_array.strides[1]
    copier.src_height = ny + 2 * sw
    copier.dst_height = ny

    copier.width_in_bytes = nx * typesize
    copier.height = ny
    copier.depth = nz

    copier()
def numpy3d_to_array(np_array, allow_surface_bind=True):
    import pycuda.autoinit

    d, h, w = np_array.shape

    descr = drv.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = drv.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = drv.array3d_flags.SURFACE_LDST

    device_array = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def array_to_numpy3d(cuda_array):
    import pycuda.autoinit

    descriptor = cuda_array.get_descriptor_3d()

    w = descriptor.width
    h = descriptor.height
    d = descriptor.depth
    shape = d, h, w
    dtype = array_format_to_dtype(descriptor.format)

    numpy_array = np.zeros(shape, dtype)

    copy = drv.Memcpy3D()
    copy.set_src_array(cuda_array)
    copy.set_dst_host(numpy_array)

    itemsize = numpy_array.dtype.itemsize
    copy.width_in_bytes = copy.dst_pitch = w * itemsize
    copy.dst_height = copy.height = h
    copy.depth = d
    copy()

    return numpy_array
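# Hedged round-trip sketch combining numpy3d_to_array and array_to_numpy3d
# (assumes both helpers and array_format_to_dtype live in the same module).
import numpy as np

a = np.arange(2 * 3 * 4, dtype=np.float32).reshape(2, 3, 4)  # (d, h, w)
dev_arr = numpy3d_to_array(a, allow_surface_bind=False)      # host -> cuda.Array
b = array_to_numpy3d(dev_arr)                                # cuda.Array -> host
assert np.array_equal(a, b)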
def numpy3d_to_array(np_array):
    '''Copy a 3D (d,h,w) numpy array into a 3D pycuda array that can be used
    to set a texture. (For some reason, gpuarrays can't be used to do that
    directly). A transpose happens implicitly; the CUDA array has dim (w,h,d).'''
    import pycuda.autoinit

    d, h, w = np_array.shape

    descr = driver.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    if np_array.dtype == np.float64:
        descr.format = driver.array_format.SIGNED_INT32
        descr.num_channels = 2
    else:
        descr.format = driver.dtype_to_array_format(np_array.dtype)
        descr.num_channels = 1
    descr.flags = 0

    device_array = driver.Array(descr)

    copy = driver.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def numpy3d_to_array(np_array, allow_surface_bind=True):
    """Converts a 3D numpy array to a 3D device array."""
    # numpy3d_to_array
    # taken from the pycuda mailing list (stripped down for C ordering only)
    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0
    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
def copy3D_device_to_numpy(dst, src, type_sz, width, height, depth):
    copy = cuda.Memcpy3D()
    copy.set_src_device(src)
    copy.set_dst_host(dst)
    copy.height = height
    copy.depth = depth
    copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width * type_sz
    copy()
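# Hedged usage sketch for copy3D_device_to_numpy, assuming the source is a
# contiguous linear device allocation holding a C-ordered (depth, height, width) block.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

depth, height, width = 4, 8, 16
host_src = np.random.rand(depth, height, width).astype(np.float32)
dev_src = cuda.to_device(host_src)          # linear device memory

host_dst = np.empty_like(host_src)
copy3D_device_to_numpy(host_dst, dev_src, host_src.dtype.itemsize,
                       width, height, depth)
assert np.allclose(host_dst, host_src)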
def test_3d_texture(self):
    # adapted from code by Nicolas Pinto
    w = 2
    h = 4
    d = 8
    shape = (w, h, d)

    a = numpy.asarray(numpy.random.randn(*shape), dtype=numpy.float32, order="F")

    descr = drv.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = drv.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0

    ary = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    mod = SourceModule("""
    texture<float, 3, cudaReadModeElementType> mtx_tex;

    __global__ void copy_texture(float *dest)
    {
        int x = threadIdx.x;
        int y = threadIdx.y;
        int z = threadIdx.z;
        int dx = blockDim.x;
        int dy = blockDim.y;
        int i = (z*dy + y)*dx + x;

        dest[i] = tex3D(mtx_tex, x, y, z);
        //dest[i] = x;
    }
    """)

    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")
    mtx_tex.set_array(ary)

    dest = numpy.zeros(shape, dtype=numpy.float32, order="F")
    copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])

    assert la.norm(dest - a) == 0
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez):
    print('rank= %d, (%d, %d, %d)' % (rank, nx, ny, nz), end=' ')
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes // (1024**3) == 0:
        print('%d MB' % (total_bytes // (1024**2)))
    else:
        print('%1.2f GB' % (float(total_bytes) / (1024**3)))

    if nz % Dx != 0:
        print("Error: nz is not multiple of %d" % (Dx))
        sys.exit()
    if (nx * ny) % Dy != 0:
        print("Error: nx*ny is not multiple of %d" % (Dy))
        sys.exit()

    f = np.zeros((nx, ny, nz), 'f')

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    descr = cuda.ArrayDescriptor3D()
    descr.width = nz
    descr.height = ny
    descr.depth = nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    tcex_gpu = cuda.Array(descr)
    tcey_gpu = cuda.Array(descr)
    tcez_gpu = cuda.Array(descr)

    mcopy = cuda.Memcpy3D()
    mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
    mcopy.src_height = mcopy.height = ny
    mcopy.depth = nx

    mcopy.set_src_host(cex)
    mcopy.set_dst_array(tcex_gpu)
    mcopy()
    mcopy.set_src_host(cey)
    mcopy.set_dst_array(tcey_gpu)
    mcopy()
    mcopy.set_src_host(cez)
    mcopy.set_dst_array(tcez_gpu)
    mcopy()

    eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu]
    tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu]

    return eh_fields, tex_fields
def texCubeToGPU(texCube):
    descr = cuda.ArrayDescriptor3D()
    descr.width = texCube.shape[2]
    descr.height = texCube.shape[1]
    descr.depth = 6
    descr.format = cuda.dtype_to_array_format(texCube.dtype)
    descr.num_channels = 4
    descr.flags = cuda.array3d_flags.CUBEMAP

    texCubeArray = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(texCube)
    copy.set_dst_array(texCubeArray)
    copy.width_in_bytes = copy.src_pitch = texCube.strides[1]  # d*h*w*c
    copy.src_height = copy.height = texCube.shape[1]
    copy.depth = 6
    copy()

    return texCubeArray
def _copy_halo_to_array(self, halo, array, copy_dims, copy_offsets, dtype=np.float64):
    # copy from 2-d halo to 3-d array
    #
    # Parameters:
    #   halo, array: gpuarrays involved in the copy
    #   copy_dims: number of elements to copy in (z, y, x) directions
    #   copy_offsets: offsets at the destination in (z, y, x) directions
    nz, ny, nx = self.local_dims
    sw = self.stencil_width
    d, h, w = copy_dims
    z_offs, y_offs, x_offs = copy_offsets
    typesize = array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(halo.gpudata)
    copier.set_dst_device(array.gpudata)

    # this time, offsets are at the destination:
    copier.dst_x_in_bytes = x_offs * typesize
    copier.dst_y = y_offs
    copier.dst_z = z_offs

    copier.src_pitch = halo.strides[1]
    copier.dst_pitch = array.strides[1]
    copier.src_height = h
    copier.dst_height = ny + 2 * sw

    copier.width_in_bytes = w * typesize
    copier.height = h
    copier.depth = d

    # perform the copy:
    copier()
def np3DtoCudaArray(npArray, allowSurfaceBind=False):
    #import pycuda.autoinit
    d, h, w = npArray.shape

    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    descr3D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr3D.num_channels = 1
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descr3D)

    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
def get_Memcpy3D_d2d(src, dst, src_pitch, dst_pitch, dim_args, itemsize,
                     src_height, dst_height):
    '''
    Wrapper for the pycuda.driver.Memcpy3D() function (same args).
    Returns a callable object which copies the arrays on invocation of ().

    dim_args: list, [depth, height, width] (not width_in_bytes)
    '''
    depth, height, width = dim_args
    width_in_bytes = width * itemsize

    src_ptr = getattr(src, 'gpudata', 0)  # set to NULL if no valid ptr
    dst_ptr = getattr(dst, 'gpudata', 0)  # set to NULL if no valid ptr

    cpy = drv.Memcpy3D()
    cpy.set_src_device(src_ptr)
    cpy.set_dst_device(dst_ptr)
    cpy.height = np.int64(height)
    cpy.width_in_bytes = np.int64(width_in_bytes)
    cpy.depth = np.int64(depth)
    cpy.src_pitch = src_pitch
    cpy.dst_pitch = dst_pitch
    cpy.src_height = np.int64(src_height)
    cpy.dst_height = np.int64(dst_height)
    return cpy
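# Hedged usage sketch for get_Memcpy3D_d2d: a full device-to-device copy between
# two contiguous gpuarrays of the same shape (argument values are illustrative).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

depth, height, width = 4, 8, 16
src = gpuarray.to_gpu(np.random.rand(depth, height, width).astype(np.float32))
dst = gpuarray.zeros_like(src)

cpy = get_Memcpy3D_d2d(src, dst,
                       src_pitch=src.strides[1], dst_pitch=dst.strides[1],
                       dim_args=[depth, height, width],
                       itemsize=src.dtype.itemsize,
                       src_height=height, dst_height=height)
cpy()  # perform the copy
assert np.allclose(dst.get(), src.get())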
def _prepare_F_texture(self):
    descr = drv.ArrayDescriptor3D()
    descr.width = self.side
    descr.height = self.side
    descr.depth = self.side
    descr.format = drv.dtype_to_array_format(self.F_gpu.dtype)
    descr.num_channels = 1
    descr.flags = 0

    F_array = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_device(self.F_gpu.gpudata)
    copy.set_dst_array(F_array)
    copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1]
    copy.src_height = copy.height = self.side
    copy.depth = self.side

    self.F_gpu_to_array_copy = copy
    self.F_gpu_to_array_copy()
    self.F_texture.set_array(F_array)
def gpu3D(src):
    """Copy a 3D host array (shape interpreted as (w, h, d)) into a CUDA 3D array."""
    w, h, d = src.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(src.dtype)
    descr.num_channels = 1
    descr.flags = 0

    dst = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(src)
    copy.set_dst_array(dst)
    copy.width_in_bytes = copy.src_pitch = src.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()
    return dst
def _copy_array_to_halo(self, array, halo, copy_dims, copy_offsets, dtype=np.float64):
    # copy from 3-d array to 2-d halo
    #
    # Parameters:
    #   array, halo: gpuarrays involved in the copy.
    #   copy_dims: number of elements to copy in (z, y, x) directions
    #   copy_offsets: offsets at the source in (z, y, x) directions
    nz, ny, nx = self.local_dims
    d, h, w = copy_dims
    z_offs, y_offs, x_offs = copy_offsets
    typesize = array.dtype.itemsize

    copier = cuda.Memcpy3D()
    copier.set_src_device(array.gpudata)
    copier.set_dst_device(halo.gpudata)

    copier.src_x_in_bytes = x_offs * typesize
    copier.src_y = y_offs
    copier.src_z = z_offs

    copier.src_pitch = array.strides[1]
    copier.dst_pitch = halo.strides[1]
    copier.src_height = ny
    copier.dst_height = h

    copier.width_in_bytes = w * typesize
    copier.height = h
    copier.depth = d

    # perform the copy:
    copier()
def alloc_coeff_arrays(s):
    f = np.zeros((s.nx, s.ny, s.nz), 'f')

    s.cex = np.ones_like(f) * 0.5
    s.cex[:, -1, :] = 0
    s.cex[:, :, -1] = 0
    s.cey = np.ones_like(f) * 0.5
    s.cey[:, :, -1] = 0
    s.cey[-1, :, :] = 0
    s.cez = np.ones_like(f) * 0.5
    s.cez[-1, :, :] = 0
    s.cez[:, -1, :] = 0

    descr = cuda.ArrayDescriptor3D()
    descr.width = s.nz
    descr.height = s.ny
    descr.depth = s.nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    s.tcex_gpu = cuda.Array(descr)
    s.tcey_gpu = cuda.Array(descr)
    s.tcez_gpu = cuda.Array(descr)

    mcpy = cuda.Memcpy3D()
    mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1]
    mcpy.src_height = mcpy.height = s.ny
    mcpy.depth = s.nx

    mcpy.set_src_host(s.cex)
    mcpy.set_dst_array(s.tcex_gpu)
    mcpy()
    mcpy.set_src_host(s.cey)
    mcpy.set_dst_array(s.tcey_gpu)
    mcpy()
    mcpy.set_src_host(s.cez)
    mcpy.set_dst_array(s.tcez_gpu)
    mcpy()
descr = cuda.ArrayDescriptor3D()
descr.width = nx
descr.height = ny
descr.depth = nz
descr.format = cuda.dtype_to_array_format(f.dtype)
descr.num_channels = 1
descr.flags = 0
cex_gpu = cuda.Array(descr)
cey_gpu = cuda.Array(descr)
cez_gpu = cuda.Array(descr)
chx_gpu = cuda.Array(descr)
chy_gpu = cuda.Array(descr)
chz_gpu = cuda.Array(descr)

mcopy = cuda.Memcpy3D()
mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
mcopy.src_height = mcopy.height = ny
mcopy.depth = nz

arrcopy(mcopy, set_c(f, (None, -1, -1)), cex_gpu)
arrcopy(mcopy, set_c(f, (-1, None, -1)), cey_gpu)
arrcopy(mcopy, set_c(f, (-1, -1, None)), cez_gpu)
arrcopy(mcopy, set_c(f, (None, 0, 0)), chx_gpu)
arrcopy(mcopy, set_c(f, (0, None, 0)), chy_gpu)
arrcopy(mcopy, set_c(f, (0, 0, None)), chz_gpu)

# prepare kernels
from pycuda.compiler import SourceModule
mod = SourceModule(kernels)
update_e = mod.get_function("update_e")
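# The helpers arrcopy() and set_c() are not shown in the fragment above; what
# follows is only a hedged guess at their behaviour, consistent with the
# alloc_coeff_arrays example earlier: reuse one preconfigured Memcpy3D object
# for several host -> cuda.Array copies, and build a 0.5-valued coefficient
# array zeroed on the boundary slices selected by pt.
def arrcopy(mcopy, src, dst):
    mcopy.set_src_host(src)
    mcopy.set_dst_array(dst)
    mcopy()

def set_c(f, pt):
    f[:, :, :] = 0.5
    if pt[0] is not None:
        f[pt[0], :, :] = 0
    if pt[1] is not None:
        f[:, pt[1], :] = 0
    if pt[2] is not None:
        f[:, :, pt[2]] = 0
    return f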
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo,
             cpu_data=None, dtype=np.float32):
    self.logger = logging.getLogger(__name__)
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.x_halo = x_halo
    self.y_halo = y_halo
    self.z_halo = z_halo

    nx_halo = nx + 2 * x_halo
    ny_halo = ny + 2 * y_halo
    nz_halo = nz + 2 * z_halo

    #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
    #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
    self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

    #For returning to download
    self.memorypool = PageLockedMemoryPool()

    #If we don't have any data, just allocate and return
    if cpu_data is None:
        return

    #Make sure data is in proper format
    assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (
        self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (
            str(cpu_data.shape), str((self.nz, self.ny, self.nx)),
            str((nz_halo, ny_halo, nx_halo)))
    assert cpu_data.itemsize == 4, "Wrong size of data type"
    assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

    #Create copy object from host to device
    copy = cuda.Memcpy3D()
    copy.set_src_host(cpu_data)
    copy.set_dst_device(self.data.gpudata)

    #Set offsets of destination
    x_offset = (nx_halo - cpu_data.shape[2]) // 2
    y_offset = (ny_halo - cpu_data.shape[1]) // 2
    z_offset = (nz_halo - cpu_data.shape[0]) // 2
    copy.dst_x_in_bytes = x_offset * self.data.strides[1]
    copy.dst_y = y_offset
    copy.dst_z = z_offset

    #Set pitch of destination
    copy.dst_pitch = self.data.strides[0]

    #Set width in bytes to copy for each row and
    #number of rows to copy
    width = max(self.nx, cpu_data.shape[2])
    height = max(self.ny, cpu_data.shape[1])
    depth = max(self.nz, cpu_data.shape[0])
    copy.width_in_bytes = width * cpu_data.itemsize
    copy.height = height
    copy.depth = depth

    #Perform the copy
    copy(stream)
        if async:
            drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
        else:
            drv.memcpy_dtoh(dst, src.gpudata)
    else:
        src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
        if async:
            drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
        else:
            drv.memcpy_htod(dst.gpudata, src)
    return

if len(shape) == 2:
    copy = drv.Memcpy2D()
elif len(shape) == 3:
    copy = drv.Memcpy3D()
else:
    raise ValueError("more than 2 discontiguous axes not supported %s"
                     % (tuple(sorted(axes)),))

if isinstance(src, GPUArray):
    copy.set_src_device(src.gpudata)
else:
    copy.set_src_host(src)

if isinstance(dst, GPUArray):
    copy.set_dst_device(dst.gpudata)
else:
    copy.set_dst_host(dst)

copy.width_in_bytes = src.dtype.itemsize*shape[0]
def register_multiple_images_subpix_cuda(stack, template): import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.cumath as cumath import skcuda.fft as cu_fft import skcuda.linalg as lin import skcuda.cublas as cub from numpy import pi, newaxis, floor import cmath from pycuda.elementwise import ElementwiseKernel from pycuda.compiler import SourceModule from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix from numpy.fft import fftshift, ifftshift fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1, use_numpy_fft=False) mod = SourceModule(""" #include <pycuda-complex.hpp>" __global__ void load_convert(unsigned short *a, float *b,int f, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = f * imlen; if (idx <imlen) { b[idx] = (float)a[offset+idx]; } } __global__ void convert_export(float *a, unsigned short *b,int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ; } } __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx] = x[idx] * y[idx]; } } __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = x[idx]._M_re; y[idx]._M_im = -x[idx]._M_im; } } __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { y[idx]._M_re = 0; y[idx]._M_im = x[idx] * sx; } } __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen, int nlargeh, int nh) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; int offset = imlenl*3/4; if (idx<imlen) { int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl; x[target_ind] = y[idx]; } } __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { z[idx]._M_re = 0; z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg; } } __global__ void sub_float(float *x, float *y, float sv, int imlen) { int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x * blockDim.x + threadIdx.x ; if (idx <imlen) { x[idx] = y[idx]-sv; } } """) load_convert_kernel = mod.get_function('load_convert') convert_export_kernel = mod.get_function('convert_export') convert_multiply_kernel = mod.get_function('convert_multiply') multiply_float_kernel = mod.get_function('multiply_comp_float') transfer_array_kernel = mod.get_function('transfer_array') calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix') conj_kernel = mod.get_function('calc_conj') sub_float_kernel = mod.get_function('sub_float') Z = stack.shape[0] M = stack.shape[1] N = stack.shape[2] max_memsize = 4200000000 imlen = M * N half_imlen = M * (N // 2 + 1) grid_dim = (64, int(imlen / (512 * 64)) + 1, 1) block_dim = (512, 1, 1) #512 threads per block stack_bin = int(max_memsize / (M * N * stack.itemsize)) stack_ite = int(Z / stack_bin) + 1 usfac = 
100 ## needs to be bigger than 10 if not template.shape == stack.shape[1:]: raise ValueError("Images must have same shape.") if np.any(np.isnan(template)): template = template.copy() template[template != template] = 0 if np.any(np.isnan(stack)): stack = stack.copy() stack[stack != stack] = 0 mlarge = M * 2 nlarge = N * 2 t = time.time() plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64) plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32) plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32) cub_h = cub.cublasCreate() template_gpu = gpuarray.to_gpu(template.astype('float32')) source_gpu = gpuarray.empty((M, N), np.float32) ifft_gpu = gpuarray.empty((M, N), np.float32) result_gpu = gpuarray.empty((M, N), np.uint16) templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64) prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64) shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64) cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True) templatef_gpu = templatef_gpu.conj() move_list = np.zeros((Z, 2)) largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64) largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32) imlenl = mlarge * (nlarge // 2 + 1) zoom_factor = 1.5 dftshiftG = trunc(ceil(usfac * zoom_factor) / 2) #% Center of output array at dftshift+1 upsample_dim = int(ceil(usfac * zoom_factor)) term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)). T[:, newaxis]) / N # fftfreq # output points term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :] term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis] term2r = (ifftshift(np.arange(M, dtype='float')) - floor(M / 2))[newaxis, :] # fftfreq term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) + 1), :].astype('float32')) term2c_gpu = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu = gpuarray.to_gpu(term1r.astype('float32')) term2r_gpu = gpuarray.to_gpu(term2r.astype('float32')) term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32')) term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32')) kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32) kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32) kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64) kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64) Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M)) Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N)) [Nc, Nr] = np.meshgrid(Nc, Nr) Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32')) Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32')) upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64) upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64) source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype) copy = drv.Memcpy3D() copy.set_src_host(stack.data) copy.set_dst_device(source_stack.gpudata) copy.width_in_bytes = copy.src_pitch = stack.strides[1] copy.src_height = copy.height = M for zb in range(stack_ite): zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z)) copy.depth = len(zrange) copy.src_z = int(zrange[0]) copy() for i in range(len(zrange)): t = zb * stack_bin + i load_convert_kernel(source_stack, source_gpu.gpudata, np.int32(i), np.int32(imlen), block=block_dim, grid=grid_dim) cu_fft.fft(source_gpu, sourcef_gpu, 
plan_forward, scale=True) multiply_float_kernel(sourcef_gpu, templatef_gpu, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) transfer_array_kernel(largearray1_gpu, prod_gpu1, np.int32(imlenl), np.int32(half_imlen), np.int32(nlarge // 2 + 1), np.int32(N // 2 + 1), block=block_dim, grid=grid_dim) cu_fft.ifft(largearray1_gpu, largearray2_gpu, plan_inverse_big, scale=True) peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size, largearray2_gpu.gpudata, 1) rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape) md2 = trunc(mlarge / 2) nd2 = trunc(nlarge / 2) if rloc > md2: row_shift2 = rloc - mlarge else: row_shift2 = rloc if cloc > nd2: col_shift2 = cloc - nlarge else: col_shift2 = cloc row_shiftG = row_shift2 / 2. col_shiftG = col_shift2 / 2. # Initial shift estimate in upsampled grid row_shiftG0 = round(row_shiftG * usfac) / usfac col_shiftG0 = round(col_shiftG * usfac) / usfac # Matrix multiply DFT around the current shift estimate roffG = dftshiftG - row_shiftG0 * usfac coffG = dftshiftG - col_shiftG0 * usfac sub_float_kernel(term2c_gpu, term2c_gpu_ori, np.float32(coffG / usfac), np.int32(term2c_gpu.size), block=block_dim, grid=grid_dim) sub_float_kernel(term1r_gpu, term1r_gpu_ori, np.float32(roffG), np.int32(term1r_gpu.size), block=block_dim, grid=grid_dim) lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu) lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu) convert_multiply_kernel(kernc_gpu, kernc_gpuc, np.float32(-2 * pi), np.int32(kernc_gpu.size), block=block_dim, grid=grid_dim) convert_multiply_kernel(kernr_gpu, kernr_gpuc, np.float32(-2 * pi / (M * usfac)), np.int32(kernr_gpu.size), block=block_dim, grid=grid_dim) cumath.exp(kernc_gpuc, out=kernc_gpuc) cumath.exp(kernr_gpuc, out=kernr_gpuc) conj_kernel(prod_gpu1, prod_gpu2, np.int32(half_imlen), block=block_dim, grid=grid_dim) lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1) lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2) CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2) rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape) CCGmax = CCG[rlocG, clocG] rlocG = rlocG - dftshiftG #+ 1 # +1 # questionable/failed hack + 1; clocG = clocG - dftshiftG #+ 1 # -1 # questionable/failed hack - 1; row_shiftG = row_shiftG0 + rlocG / usfac col_shiftG = col_shiftG0 + clocG / usfac diffphaseG = arctan2(imag(CCGmax), real(CCGmax)) # Compute registered version of source stack calc_shiftmatrix_kernel(Nr_gpu, Nc_gpu, shiftmatrix, np.float32(row_shiftG * 2 * np.pi), np.float32(col_shiftG * 2 * np.pi), np.float32(diffphaseG), np.int32(half_imlen), block=block_dim, grid=grid_dim) cumath.exp(shiftmatrix, out=shiftmatrix) multiply_float_kernel(sourcef_gpu, shiftmatrix, prod_gpu1, np.int32(half_imlen), block=block_dim, grid=grid_dim) cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse) convert_export_kernel(ifft_gpu, result_gpu, np.int32(imlen), block=block_dim, grid=grid_dim) move_list[t, :] = (row_shiftG, col_shiftG) stack[t, :, :] = result_gpu.get() cub.cublasDestroy(cub_h) return (stack, move_list)
def copy_non_contiguous(dst, src):
    """Copy ``src`` array to ``dst`` array.

    A gpu-array may have a non-contiguous block of memory, i.e. it may have
    substantial pitches/strides. However, a cpu-array must have a contiguous
    block of memory. All four directions are allowed.
    """
    assert src.dtype == dst.dtype, \
        "src ({}) and dst ({}) must have the same datatype.".format(str(src.dtype), str(dst.dtype))
    assert dst.shape == src.shape, \
        "Shapes do not match: " + str(dst.shape) + " <-> " + str(src.shape)

    itemsize = np.dtype(src.dtype).itemsize
    copy = cuda.Memcpy2D()
    src_on_gpu = isinstance(src, pycuda.gpuarray.GPUArray)
    dst_on_gpu = isinstance(dst, pycuda.gpuarray.GPUArray)
    if src_on_gpu:
        copy.set_src_device(src.gpudata)
    else:
        copy.set_src_host(src)
    if dst_on_gpu:
        copy.set_dst_device(dst.gpudata)
    else:
        copy.set_dst_host(dst)

    if len(src.shape) == 1:
        copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
        copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
        copy.width_in_bytes = itemsize
        copy.height = src.shape[0]
        copy(aligned=False)

    elif len(src.shape) == 2:
        if (itemsize != src.strides[1] if src_on_gpu else False) or \
           (itemsize != dst.strides[1] if dst_on_gpu else False):
            # arrays have to be copied column by column, because there are two
            # substantial pitches/strides, which is not supported by cuda.
            copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
            copy.width_in_bytes = itemsize
            copy.height = src.shape[0]

            for col in range(src.shape[1]):
                copy.src_x_in_bytes = col * src.strides[1] if src_on_gpu else col * itemsize
                copy.dst_x_in_bytes = col * dst.strides[1] if dst_on_gpu else col * itemsize
                copy(aligned=False)
        else:
            # both arrays have a contiguous block of memory for each row
            copy.src_pitch = src.strides[0] if src_on_gpu else itemsize * src.shape[1]
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize * src.shape[1]
            copy.width_in_bytes = itemsize * src.shape[1]
            copy.height = src.shape[0]
            copy(aligned=False)

    elif len(src.shape) == 3:
        if (src.strides[0] != src.shape[1] * src.strides[1] if src_on_gpu else False) or \
           (dst.strides[0] != dst.shape[1] * dst.strides[1] if dst_on_gpu else False):
            # arrays have to be copied plane by plane, because there is a substantial
            # pitch/stride for the z-axis which is not supported by cuda.
            for plane in range(src.shape[0]):
                copy_non_contiguous(dst[plane, :, :], src[plane, :, :])
            return

        copy = cuda.Memcpy3D()
        if src_on_gpu:
            copy.set_src_device(src.gpudata)
        else:
            copy.set_src_host(src)
        if dst_on_gpu:
            copy.set_dst_device(dst.gpudata)
        else:
            copy.set_dst_host(dst)

        copy.src_pitch = src.strides[1] if src_on_gpu else itemsize * src.shape[2]
        copy.dst_pitch = dst.strides[1] if dst_on_gpu else itemsize * src.shape[2]
        copy.width_in_bytes = itemsize * src.shape[2]
        copy.height = copy.src_height = copy.dst_height = src.shape[1]
        copy.depth = src.shape[0]
        copy()
    else:
        raise RuntimeError("dimension %d is not supported." % len(src.shape))
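# Hedged usage sketch for copy_non_contiguous, exercising the 3-D Memcpy3D
# branch with a contiguous device array copied back to a host array.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

a_host = np.arange(4 * 6 * 8, dtype=np.float32).reshape(4, 6, 8)
a_gpu = gpuarray.to_gpu(a_host)

out = np.empty_like(a_host)
copy_non_contiguous(out, a_gpu)     # device -> host
assert np.array_equal(out, a_host)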
def np3DtoCudaArray(npArray, prec, order="C", allowSurfaceBind=False):
    ''' Some parameters like stride are explained in PyCUDA:
        driver.py test_driver.py gpuarray.py '''
    # For 1D-2D CUDA arrays the descriptor is the same, just putting LAYERED flags
    # if order != "C": raise LogicError("Just implemented for C order")
    dimension = len(npArray.shape)
    case = order in ["C", "F"]
    if not case:
        raise LogicError("order must be either F or C")

    # if dimension == 1:
    #     w = npArray.shape[0]
    #     h, d = 0, 0
    if dimension == 2:
        if order == "C":
            stride = 0
        if order == "F":
            stride = -1
        h, w = npArray.shape
        d = 1
        if allowSurfaceBind:
            descrArr = cuda.ArrayDescriptor3D()
            descrArr.width = w
            descrArr.height = h
            descrArr.depth = d
        else:
            descrArr = cuda.ArrayDescriptor()
            descrArr.width = w
            descrArr.height = h
            # descrArr.depth = d
    elif dimension == 3:
        if order == "C":
            stride = 1
        if order == "F":
            stride = 1
        d, h, w = npArray.shape
        descrArr = cuda.ArrayDescriptor3D()
        descrArr.width = w
        descrArr.height = h
        descrArr.depth = d
    else:
        raise LogicError("CUDArray dimensions 2 and 3 supported at the moment ...")

    if prec == 'float':
        descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
        descrArr.num_channels = 1
    elif prec == 'cfloat':  # Hack for complex64 = (float32, float32) == (re, im)
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int2 (hi=re, lo=im) structure
        descrArr.num_channels = 2
    elif prec == 'double':  # Hack for doubles
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int2 (hi, lo) structure
        descrArr.num_channels = 2
    elif prec == 'cdouble':  # Hack for complex doubles
        descrArr.format = cuda.array_format.SIGNED_INT32  # Reading data as int4 (re=(hi,lo), im=(hi,lo)) structure
        descrArr.num_channels = 4
    else:
        descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
        descrArr.num_channels = 1

    if allowSurfaceBind:
        if dimension == 2:
            descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
        descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

    cudaArray = cuda.Array(descrArr)
    if allowSurfaceBind or dimension == 3:
        copy3D = cuda.Memcpy3D()
        copy3D.set_src_host(npArray)
        copy3D.set_dst_array(cudaArray)
        copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
        # if dimension == 3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]  # Just C order support
        # if dimension == 2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0]  # Just C order support
        copy3D.src_height = copy3D.height = h
        copy3D.depth = d
        copy3D()
        return cudaArray, copy3D
    else:
        # if dimension == 3:
        #     copy3D = cuda.Memcpy3D()
        #     copy3D.set_src_host(npArray)
        #     copy3D.set_dst_array(cudaArray)
        #     copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
        #     copy3D.src_height = copy3D.height = h
        #     copy3D.depth = d
        #     copy3D()
        #     return cudaArray, copy3D
        # if dimension == 2:
        cudaArray = cuda.Array(descrArr)
        copy2D = cuda.Memcpy2D()
        copy2D.set_src_host(npArray)
        copy2D.set_dst_array(cudaArray)
        copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride]
        # copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]  # Just C order support
        copy2D.src_height = copy2D.height = h
        copy2D(aligned=True)
        return cudaArray, copy2D