예제 #1
0
def copy2D_array_to_device(dst, src, type_sz, width, height):
    """Copy ``height`` rows from a CUDA array into linear device memory.

    ``width`` is a count of elements of ``type_sz`` bytes each; the resulting
    byte count is used as the row width and as both the source and the
    destination pitch.
    """
    row_bytes = width * type_sz

    transfer = cuda_driver.Memcpy2D()
    transfer.set_src_array(src)
    transfer.set_dst_device(dst)
    transfer.width_in_bytes = row_bytes
    transfer.src_pitch = row_bytes
    transfer.dst_pitch = row_bytes
    transfer.height = height
    transfer(aligned=True)
예제 #2
0
def get_Memcpy2D_d2d(src, dst, src_pitch, dst_pitch, dim_args, itemsize,
                     **kwargs):
    """Build a reusable device-to-device 2D copy around ``drv.Memcpy2D``.

    Returns a callable object; calling it with no arguments runs the copy
    with ``aligned=True``.

    src, dst: objects whose ``gpudata`` attribute is used as the device
        pointer; 0 is used when the attribute is missing.
    src_pitch, dst_pitch: assigned unchanged to the copy descriptor.
    dim_args: exactly two values, unpacked as ``(height, width)``; the width
        is a count of elements, not bytes.
    itemsize: bytes per element, used to convert the width to bytes.
    kwargs: ignored; accepted so the signature matches the 3D variant.
    """

    class _copy():
        """Thin proxy around the configured Memcpy2D descriptor.

        Calling the proxy always passes ``aligned=True``; the two setters
        forward to the descriptor so the source/destination pointers can be
        replaced after construction.
        """

        def __init__(self, memcpy2d):
            self.cpy = memcpy2d

        def set_src_device(self, src_ptr):
            self.cpy.set_src_device(src_ptr)

        def set_dst_device(self, dst_ptr):
            self.cpy.set_dst_device(dst_ptr)

        def __call__(self):
            self.cpy(aligned=True)

    n_rows, n_cols = dim_args
    row_bytes = n_cols * itemsize

    descriptor = drv.Memcpy2D()
    # Fall back to a NULL pointer (0) when an endpoint has no ``gpudata``.
    descriptor.set_src_device(getattr(src, 'gpudata', 0))
    descriptor.set_dst_device(getattr(dst, 'gpudata', 0))
    descriptor.height = np.int64(n_rows)
    descriptor.width_in_bytes = np.int64(row_bytes)
    descriptor.src_pitch = src_pitch
    descriptor.dst_pitch = dst_pitch
    return _copy(descriptor)
예제 #3
0
    def enqueue(self, batch_size, inputs, output, workspace, stream):
        """Split ``inputs[0]`` along ``self.axis`` into the buffers in ``output``.

        One strided 2D copy is issued on ``stream`` per entry of
        ``self.sections``: entry ``i`` covers the index range from the
        previous section boundary (0 for the first) up to ``self.sections[i]``
        along the split axis and is written to ``output[i]``.

        ``workspace`` is not used.  Every element is taken to be
        ``ELEM_SIZE`` (4) bytes wide.
        """
        ELEM_SIZE = 4  # bytes per element (hard-coded)

        # Number of elements after / before the split axis.  These do not
        # depend on the section, so they are computed once.
        dims = list(self.in_dims)
        acc_size_lo, acc_size_hi = 1, 1
        for d in dims[self.axis + 1:]:
            acc_size_lo *= d.size
        for d in dims[:self.axis]:
            acc_size_hi *= d.size

        # One source "row" spans the whole split axis.
        src_pitch = acc_size_lo * self.in_dims[self.axis].size
        height = acc_size_hi * batch_size

        last = 0
        for i, pos in enumerate(self.sections):
            src_offset = acc_size_lo * last
            # Destination rows are packed: only this section's slice.
            dst_pitch = acc_size_lo * (pos - last)

            copy = cuda.Memcpy2D()
            copy.set_src_device(int(inputs[0]))
            copy.set_dst_device(int(output[i]))
            copy.src_x_in_bytes = src_offset * ELEM_SIZE
            copy.src_pitch = src_pitch * ELEM_SIZE
            copy.dst_pitch = dst_pitch * ELEM_SIZE
            copy.width_in_bytes = dst_pitch * ELEM_SIZE
            copy.height = height
            copy(stream)
            last = pos
예제 #4
0
파일: matrix.py 프로젝트: riccitensor/hebel
def extract_columns(mat, start=0, stop=None, target=None):
    """Copy columns ``start:stop`` of a 2D GPU matrix into ``target``.

    ``stop`` defaults to the number of columns.  When ``target`` is omitted
    a new ``(rows, stop - start)`` array of the same dtype is allocated.
    Returns ``target``.
    """
    dtype = mat.dtype
    elem_bytes = np.dtype(dtype).itemsize
    n_rows, n_cols = mat.shape
    stop = n_cols if stop is None else stop
    n_selected = stop - start

    assert mat.flags.c_contiguous
    assert 0 <= start <= n_cols and 0 <= stop <= n_cols and stop > start

    if target is None:
        target = gpuarray.empty((n_rows, n_selected), dtype)

    selected_bytes = n_selected * elem_bytes

    transfer = drv.Memcpy2D()
    transfer.set_src_device(mat.gpudata)
    transfer.set_dst_device(target.gpudata)
    # Start each source row ``start`` elements in; source rows keep the full
    # matrix pitch while destination rows are packed.
    transfer.src_x_in_bytes = start * elem_bytes
    transfer.src_pitch = n_cols * elem_bytes
    transfer.dst_pitch = selected_bytes
    transfer.width_in_bytes = selected_bytes
    transfer.height = n_rows
    transfer(aligned=True)

    return target
예제 #5
0
    def upload(self, stream, cpu_data, extent=None):
        """Copy a host array into a rectangular region of the device buffer.

        ``extent`` is ``(x, y, nx, ny)``; when omitted the region starts at
        the halo offsets and spans ``self.nx`` by ``self.ny``.  ``cpu_data``
        must have shape ``(ny, nx)``.  The copy is issued on ``stream``; this
        method does not synchronize the stream itself.
        """
        if extent is None:
            x, y, nx, ny = self.x_halo, self.y_halo, self.nx, self.ny
        else:
            x, y, nx, ny = extent

        # The host data must match the region, and the region must fit inside
        # the buffer including its halos.
        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
        assert x + nx <= self.nx + 2 * self.x_halo
        assert y + ny <= self.ny + 2 * self.y_halo

        # Host -> device transfer descriptor.
        transfer = cuda.Memcpy2D()
        transfer.set_dst_device(self.data.gpudata)
        transfer.set_src_host(cpu_data)

        # Offsets and pitch of the destination (device) region.
        transfer.dst_x_in_bytes = int(x) * self.data.strides[1]
        transfer.dst_y = int(y)
        transfer.dst_pitch = self.data.strides[0]

        # Bytes per row and number of rows to copy.
        transfer.width_in_bytes = int(nx) * cpu_data.itemsize
        transfer.height = int(ny)

        transfer(stream)
    def test_pycuda_memcpy_Surface_Surface(self):
        """Copy each decoded surface into a new surface and compare them.

        For every surface produced by the decoder, the first plane is copied
        with a PyCUDA 2D memcpy into a freshly made surface of the same
        format and size; both surfaces are then downloaded and must be equal.
        The loop ends when the decoder returns an empty surface.
        """
        failed_download = 'Failed to download decoded surface'

        while True:
            surf_src = self.nvDec.DecodeSingleSurface()
            if surf_src.Empty():
                break
            src_plane = surf_src.PlanePtr()

            surf_dst = nvc.Surface.Make(self.nvDec.Format(),
                                        self.nvDec.Width(),
                                        self.nvDec.Height(), self.gpu_id)
            self.assertFalse(surf_dst.Empty())
            dst_plane = surf_dst.PlanePtr()

            plane_copy = cuda.Memcpy2D()
            plane_copy.width_in_bytes = src_plane.Width() * src_plane.ElemSize()
            plane_copy.src_pitch = src_plane.Pitch()
            plane_copy.dst_pitch = dst_plane.Pitch()
            # NOTE(review): `width` is not set by any other Memcpy2D user in
            # this file; presumably it has no effect on the copy - confirm.
            plane_copy.width = src_plane.Width()
            plane_copy.height = src_plane.Height()
            plane_copy.set_src_device(src_plane.GpuMem())
            plane_copy.set_dst_device(dst_plane.GpuMem())
            plane_copy(self.cuda_str)

            # Download source first, then destination, into empty buffers
            # that the downloader fills.
            frames = []
            for surface in (surf_src, surf_dst):
                frame = np.ndarray(shape=(0), dtype=np.uint8)
                if not self.nvDwn.DownloadSingleSurface(surface, frame):
                    self.fail(failed_download)
                frames.append(frame)

            frame_src, frame_dst = frames
            if not np.array_equal(frame_src, frame_dst):
                self.fail('Video frames are not equal')
예제 #7
0
    def download(self, stream, asynch=False):
        """Copy the interior (non-halo) region of the 3D device array to host.

        A host buffer of shape ``(self.nz, self.ny, self.nx)`` and dtype
        float32 is taken from ``self.memorypool`` and filled by a copy issued
        on ``stream``.  Unless ``asynch`` is true the stream is synchronized
        before returning.

        Returns the host buffer.
        """
        cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx),
                                            dtype=np.float32)

        # A 3D descriptor is required here: Memcpy2D has no z offset or depth,
        # so with it only a single 2D slab could be described.
        copy = cuda.Memcpy3D()
        copy.set_src_device(self.data.gpudata)
        copy.set_dst_host(cpu_data)

        # Offsets, row pitch and slice height of the source.
        # assumes self.data is a C-ordered (z, y, x) device array that
        # includes the halo cells - TODO confirm against the constructor.
        copy.src_x_in_bytes = self.x_halo * self.data.strides[2]
        copy.src_y = self.y_halo
        copy.src_z = self.z_halo
        copy.src_pitch = self.data.strides[1]
        copy.src_height = self.data.shape[1]

        # The host buffer is tightly packed: nx elements per row, ny rows
        # per slice.
        copy.dst_pitch = self.nx * cpu_data.itemsize
        copy.dst_height = self.ny

        # Extent of the copy: bytes per row, rows per slice, slices.
        copy.width_in_bytes = self.nx * cpu_data.itemsize
        copy.height = self.ny
        copy.depth = self.nz

        copy(stream)
        if not asynch:
            stream.synchronize()

        return cpu_data
예제 #8
0
def copy2D_host_to_array(arr, host, width, height):
    """Copy ``height`` rows of ``width`` bytes from host memory to a CUDA array.

    ``width`` is used directly as the byte width and as the source pitch.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_host(host)
    transfer.set_dst_array(arr)
    transfer.src_pitch = width
    transfer.width_in_bytes = width
    transfer.height = height
    transfer(aligned=True)
예제 #9
0
def copy_2d_host_to_device(dev, host, src_pitch, dst_pitch, width, height):
    """Copy ``height`` rows of ``width`` bytes from host to device memory.

    ``src_pitch`` and ``dst_pitch`` are passed through to the copy
    descriptor unchanged.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_host(host)
    transfer.set_dst_device(dev)
    transfer.width_in_bytes = width
    transfer.height = height
    transfer.src_pitch = src_pitch
    transfer.dst_pitch = dst_pitch
    transfer(aligned=True)
예제 #10
0
def copy2D_device_to_host(host, dev, src_pitch, dst_pitch, width, height):
    """Copy ``height`` rows of ``width`` bytes from device to host memory.

    ``src_pitch`` and ``dst_pitch`` are passed through to the copy
    descriptor unchanged.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_device(dev)
    transfer.set_dst_host(host)
    transfer.width_in_bytes = width
    transfer.height = height
    transfer.src_pitch = src_pitch
    transfer.dst_pitch = dst_pitch
    transfer(aligned=True)
def predict(data):
    """Run the conv predictor followed by the LSTM predictor and print results.

    ``data`` is uploaded to the device and passed through the conv predictor.
    Its output rows are re-packed from a ``time_step`` pitch to a
    ``max_time_step`` pitch and fed to the LSTM predictor.  The integer
    output is downloaded and printed, then each row is decoded: consecutive
    repeats are collapsed, zeros are dropped, and label ``i`` is printed as
    the ``i``-th lowercase letter.

    Relies on the module-level ``max_time_step`` and on the two predictor
    classes defined outside this function.
    """
    print('data shape:', data.shape)
    batch_size = data.shape[0]
    print("TRT batch_size:", batch_size)

    d_input = cuda.mem_alloc(data.nbytes)
    cuda.memcpy_htod(d_input, data)

    time_step = data.shape[2] // 4
    print('timestep:', time_step)
    h_time_step = np.array([time_step] * batch_size, np.int32)
    d_time_step = cuda.mem_alloc(h_time_step.nbytes)
    cuda.memcpy_htod(d_time_step, h_time_step)

    # Per-sample row sizes in bytes before and after re-packing.
    short_row_bytes = time_step * 16 * 4
    long_row_bytes = max_time_step * 16 * 4

    d_cnn_output = cuda.mem_alloc(batch_size * time_step * 16 * 4)
    d_lstm_input = cuda.mem_alloc(batch_size * max_time_step * 16 * 4)
    output = np.empty((batch_size, max_time_step), dtype=np.int32)
    d_output = cuda.mem_alloc(output.nbytes)

    predictor_conv = TrtPredictor_Conv(False)
    predictor_lstm = TrtPredictor_Lstm(False)

    n_round = 1
    time0 = time.time()
    for _ in range(n_round):
        predictor_conv.infer(data.shape, d_input, d_cnn_output)

        # Spread the conv rows out to the wider LSTM input pitch.
        repack = cuda.Memcpy2D()
        repack.src_pitch = short_row_bytes
        repack.dst_pitch = long_row_bytes
        repack.width_in_bytes = repack.src_pitch
        repack.height = batch_size
        repack.set_src_device(d_cnn_output)
        repack.set_dst_device(d_lstm_input)
        repack(False)

        predictor_lstm.infer(batch_size, d_lstm_input, d_time_step, d_output)
    cuda.Context.synchronize()
    print("TRT average:", (time.time() - time0) * 1.0 / n_round)

    cuda.memcpy_dtoh(output, d_output)
    print(output)
    for row in output:
        previous = None
        collapsed = []
        for label in row:
            # Keep a label only when it differs from the one just before it.
            if not (previous == label):
                collapsed.append(label)
                previous = label
        print([chr(ord('a') + i - 1) for i in collapsed if i != 0])
예제 #12
0
def update_2d_texture(texref, newdata):
    """Overwrite the CUDA array bound to ``texref`` with ``newdata``.

    ``newdata`` is made contiguous first.  Its shape must equal the height
    and width in the array's descriptor, and the array must have exactly one
    channel.
    """
    bound_array = texref.get_array()
    newdata = numpy.ascontiguousarray(newdata)
    rows, cols = newdata.shape

    descriptor = bound_array.get_descriptor()
    assert rows == descriptor.height and cols == descriptor.width
    assert descriptor.num_channels == 1

    row_bytes = newdata.strides[0]

    transfer = cuda.Memcpy2D()
    transfer.set_src_host(newdata)
    transfer.set_dst_array(bound_array)
    transfer.src_pitch = row_bytes
    transfer.width_in_bytes = row_bytes
    transfer.height = rows
    transfer.src_height = rows
    transfer(True)
예제 #13
0
def gpuArray2DtocudaArray(gpuArray):
    """Create a single-channel 2D CUDA array holding a copy of ``gpuArray``.

    The array format is derived from ``gpuArray.dtype`` and the row size is
    taken from ``gpuArray.strides[0]``.

    Returns ``(cudaArray, copy2D)``: the new CUDA array and the copy
    descriptor that was used to fill it.
    """
    rows, cols = gpuArray.shape

    descr2D = cuda.ArrayDescriptor()
    descr2D.width = cols
    descr2D.height = rows
    descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descr2D.num_channels = 1
    cudaArray = cuda.Array(descr2D)

    row_bytes = gpuArray.strides[0]

    copy2D = cuda.Memcpy2D()
    copy2D.set_src_device(gpuArray.ptr)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = row_bytes
    copy2D.width_in_bytes = row_bytes
    copy2D.height = rows
    copy2D.src_height = rows
    copy2D(aligned=True)
    return cudaArray, copy2D
예제 #14
0
def resize_gpu(y_gpu, out_shape):
  """Resize a 2D float32 GPU array to ``out_shape`` with the "resize" kernel.

  The input is copied into a temporary pitched allocation, which is bound to
  the ``in_tex`` texture with linear filtering and normalized coordinates;
  the kernel then writes a new ``(out_shape[0], out_shape[1])`` array.

  Only float32 input is accepted; any other dtype raises.  The temporary
  allocation is released even if the copy or the kernel launch fails.

  Returns the resized GPU array.
  """
  dtype = y_gpu.dtype
  if dtype != np.float32:
    # NOTE(review): NotImplementedException is not defined in the visible
    # part of this file - confirm it exists, or use NotImplementedError.
    raise NotImplementedException('Only float at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  # The builtin int replaces the np.int alias, which newer NumPy releases
  # no longer provide.
  resized_gpu = cua.empty((int(out_shape[0]), int(out_shape[1])),
                          y_gpu.dtype)

  # 4 == bytes per float32 element.
  temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1],
                                       y_gpu.shape[0],
                                       4)
  try:
    # Re-pack the tightly packed input rows to the allocation's pitch.
    copy_object = cu.Memcpy2D()
    copy_object.set_src_device(y_gpu.gpudata)
    copy_object.set_dst_device(temp_gpu)
    copy_object.src_pitch = 4 * y_gpu.shape[1]
    copy_object.dst_pitch = pitch
    copy_object.width_in_bytes = 4 * y_gpu.shape[1]
    copy_object.height = y_gpu.shape[0]
    copy_object(aligned=False)

    in_tex = mod.get_texref('in_tex')
    descr = cu.ArrayDescriptor()
    descr.width = y_gpu.shape[1]
    descr.height = y_gpu.shape[0]
    descr.format = cu.array_format.FLOAT
    descr.num_channels = 1
    in_tex.set_address_2d(temp_gpu, descr, pitch)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    resize_fun_gpu(resized_gpu.gpudata,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                   block=block_size, grid=grid_size)
  finally:
    temp_gpu.free()

  return resized_gpu
예제 #15
0
파일: matrix.py 프로젝트: riccitensor/hebel
def insert_columns(src, dst, offset):
    """Write all columns of ``src`` into ``dst`` starting at column ``offset``.

    Both arrays must share dtype and row count, and ``dst`` must be wide
    enough to hold ``src`` at the given offset.
    """
    dtype = src.dtype
    elem_bytes = np.dtype(dtype).itemsize
    src_rows, src_cols = src.shape
    dst_rows, dst_cols = dst.shape

    assert dst.dtype == dtype
    assert src_rows == dst_rows
    assert dst_cols >= offset + src_cols

    src_row_bytes = src_cols * elem_bytes

    transfer = drv.Memcpy2D()
    transfer.set_src_device(src.gpudata)
    transfer.set_dst_device(dst.gpudata)
    # Source rows are copied whole; each lands ``offset`` elements into the
    # corresponding destination row.
    transfer.dst_x_in_bytes = offset * elem_bytes
    transfer.src_pitch = src_row_bytes
    transfer.width_in_bytes = src_row_bytes
    transfer.dst_pitch = dst_cols * elem_bytes
    transfer.height = src_rows
    transfer(aligned=True)
예제 #16
0
def extract_columns(mat, start=0, stop=None, target=None):
    """Copy the slice ``start:stop`` along axis 1 of a 2D or 3D GPU array.

    A 3D ``(N, M, Z)`` input is handled by viewing it as ``(N, M * Z)`` and
    scaling ``start``/``stop`` by ``Z``; the result is reshaped back to
    ``(N, stop - start, Z)``.  ``stop`` defaults to ``M``.  When ``target``
    is omitted a new array is allocated from ``memory_pool``.

    Raises ValueError for inputs that are neither 2D nor 3D.
    Returns the array holding the extracted slice.
    """
    dtype = mat.dtype
    elem_bytes = np.dtype(dtype).itemsize

    rank = len(mat.shape)
    if rank == 2:
        is_3d = False
        N, M = mat.shape
        if stop is None:
            stop = M
    elif rank == 3:
        is_3d = True
        N, M, Z = mat.shape
        if stop is None:
            stop = M
        # Flatten the two trailing axes so the slice becomes a column range.
        start, stop, M = start * Z, stop * Z, M * Z
        mat = mat.reshape((N, M))
    else:
        raise ValueError("mat must have two or three dimensions")
    n_selected = stop - start

    assert mat.flags.c_contiguous
    assert 0 <= start <= M and 0 <= stop <= M and stop > start

    if target is None:
        target = gpuarray.empty((N, n_selected), dtype,
                                allocator=memory_pool.allocate)

    selected_bytes = n_selected * elem_bytes

    transfer = drv.Memcpy2D()
    transfer.set_src_device(mat.gpudata)
    transfer.set_dst_device(target.gpudata)
    transfer.src_x_in_bytes = start * elem_bytes
    transfer.src_pitch = M * elem_bytes
    transfer.dst_pitch = selected_bytes
    transfer.width_in_bytes = selected_bytes
    transfer.height = N
    transfer(aligned=True)

    if is_3d:
        assert not n_selected % Z
        target = target.reshape((N, n_selected // Z, Z))

    return target
예제 #17
0
def np2DtoCudaArray(npArray, allowSurfaceBind=False):
    """Create a single-channel 2D CUDA array holding a copy of ``npArray``.

    The array format is derived from ``npArray.dtype`` and the row size is
    taken from ``npArray.strides[0]``.

    When ``allowSurfaceBind`` is true the array is described with a 3D
    descriptor carrying the ``SURFACE_LDST`` flag, because the flag field
    is set on ``ArrayDescriptor3D`` elsewhere in this file.  Previously this
    path referenced an undefined name and raised NameError.

    Returns ``(cudaArray, descr2D)``: the new CUDA array and the descriptor
    it was created from.
    """
    h, w = npArray.shape
    if allowSurfaceBind:
        descr2D = cuda.ArrayDescriptor3D()
        # NOTE(review): depth 0 is intended to request a plain 2D array
        # from the 3D descriptor - confirm against the CUDA array docs.
        descr2D.depth = 0
        descr2D.flags = cuda.array3d_flags.SURFACE_LDST
    else:
        descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr2D.num_channels = 1
    cudaArray = cuda.Array(descr2D)

    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, descr2D
예제 #18
0
    def download(self, stream, cpu_data=None, asynch=False, extent=None):
        """Copy a rectangular region of the device buffer to host memory.

        ``extent`` is ``(x, y, nx, ny)``; when omitted the region starts at
        the halo offsets and spans ``self.nx`` by ``self.ny``.  When
        ``cpu_data`` is omitted a float32 ``(ny, nx)`` buffer is taken from
        ``self.memorypool``; otherwise it must already have that shape.  The
        copy is issued on ``stream``, which is synchronized before returning
        when ``asynch`` equals False.

        Returns the host buffer.
        """
        if extent is None:
            x, y, nx, ny = self.x_halo, self.y_halo, self.nx, self.ny
        else:
            x, y, nx, ny = extent

        if cpu_data is None:
            # Host memory comes from the pool; the original author noted
            # that a page-locked allocation crashed Python here.
            cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

        # The host buffer must match the region, and the region must fit
        # inside the buffer including its halos.
        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
        assert x + nx <= self.nx + 2 * self.x_halo
        assert y + ny <= self.ny + 2 * self.y_halo

        # Device -> host transfer descriptor.
        transfer = cuda.Memcpy2D()
        transfer.set_src_device(self.data.gpudata)
        transfer.set_dst_host(cpu_data)

        # Offsets and pitch of the source (device) region.
        transfer.src_x_in_bytes = int(x) * self.data.strides[1]
        transfer.src_y = int(y)
        transfer.src_pitch = self.data.strides[0]

        # Bytes per row and number of rows to copy.
        transfer.width_in_bytes = int(nx) * cpu_data.itemsize
        transfer.height = int(ny)

        transfer(stream)
        if asynch == False:
            stream.synchronize()

        return cpu_data
예제 #19
0
def pad_array(mat, left=0, right=0, val=0., new_shape=None, stream=None):
    """Return a copy of ``mat`` with ``left``/``right`` columns of padding.

    Arrays with more than two dimensions are viewed as
    ``(shape[0], prod(shape[1:]))`` before padding.  The padding is filled
    with ``val``.  If ``new_shape`` is given the padded result is reshaped to
    it.  ``stream`` is passed to the copy call as its only argument.

    Character arrays (dtype ``'|S1'``) are temporarily treated as int8 and a
    string ``val`` is converted with ``ord``.  The caller's array gets its
    ``'|S1'`` dtype back on every exit path, including errors.

    Raises ValueError if ``mat`` has fewer than two dimensions.
    """
    assert mat.flags.c_contiguous

    # Keep a handle on the caller's object: ``mat`` is rebound below when it
    # is reshaped, and the dtype must be restored on the original.
    caller_mat = mat

    is_chararray = False
    if mat.dtype == '|S1':
        is_chararray = True
        mat.dtype = np.int8
        if isinstance(val, str):
            val = ord(val)

    try:
        if len(mat.shape) == 2:
            height, width = mat.shape
        elif len(mat.shape) > 2:
            height = mat.shape[0]
            width = np.prod(mat.shape[1:])
            mat = mat.reshape((height, width))
        else:
            raise ValueError('Array must be at least two-dimensional.')

        padded_width = width + left + right

        padded_mat = gpuarray.empty((height, padded_width), dtype=mat.dtype,
                                    allocator=memory_pool.allocate).fill(val)

        itemsize = np.dtype(padded_mat.dtype).itemsize
        copy = drv.Memcpy2D()
        copy.set_src_device(mat.gpudata)
        copy.set_dst_device(padded_mat.gpudata)
        # Source rows are copied whole, ``left`` elements into each
        # destination row.
        copy.dst_x_in_bytes = left * itemsize
        copy.src_pitch = copy.width_in_bytes = width * itemsize
        copy.dst_pitch = padded_width * itemsize
        copy.height = height
        copy(stream)

        if new_shape is not None:
            padded_mat = padded_mat.reshape(new_shape)

        if is_chararray:
            padded_mat.dtype = np.dtype('|S1')

        return padded_mat
    finally:
        if is_chararray:
            caller_mat.dtype = np.dtype('|S1')
    def test_pycuda_memcpy_Surface_Tensor(self):
        """Copy each decoded surface plane into a torch tensor and compare.

        For every surface produced by the decoder, the first plane is copied
        with a PyCUDA 2D memcpy into a zero-initialised uint8 CUDA tensor of
        shape ``(height, width, 1)``.  The downloaded source frame must equal
        the flattened tensor contents.  The loop ends when the decoder
        returns an empty surface.
        """
        while True:
            surf_src = self.nvDec.DecodeSingleSurface()
            if surf_src.Empty():
                break
            src_plane = surf_src.PlanePtr()
            plane_h = src_plane.Height()
            plane_w = src_plane.Width()

            surface_tensor = torch.zeros(
                plane_h,
                plane_w,
                1,
                dtype=torch.uint8,
                device=torch.device(f'cuda:{self.gpu_id}'))
            dst_ptr = surface_tensor.data_ptr()

            plane_copy = cuda.Memcpy2D()
            plane_copy.width_in_bytes = src_plane.Width() * src_plane.ElemSize()
            plane_copy.src_pitch = src_plane.Pitch()
            # The destination pitch is the decoder width.
            plane_copy.dst_pitch = self.nvDec.Width()
            # NOTE(review): `width` is not set by any other Memcpy2D user in
            # this file; presumably it has no effect on the copy - confirm.
            plane_copy.width = src_plane.Width()
            plane_copy.height = src_plane.Height()
            plane_copy.set_src_device(src_plane.GpuMem())
            plane_copy.set_dst_device(dst_ptr)
            plane_copy(self.cuda_str)

            frame_src = np.ndarray(shape=(0), dtype=np.uint8)
            if not self.nvDwn.DownloadSingleSurface(surf_src, frame_src):
                self.fail('Failed to download decoded surface')

            host_tensor = surface_tensor.to('cpu').numpy()
            frame_dst = host_tensor.reshape(
                (src_plane.Height() * src_plane.Width()))

            if not np.array_equal(frame_src, frame_dst):
                self.fail('Video frames are not equal')
예제 #21
0
print original
'''
destImage_gpu = cuda.mem_alloc_like(original)
sourceImage_gpu = cuda.mem_alloc_like(original)
intermediateImage_gpu = cuda.mem_alloc_like(original)
'''
destImage_gpu, pit = cuda.mem_alloc_pitch(7 * 4, 2,
                                          numpy.dtype(numpy.float32).itemsize)
sourceImage_gpu, pit2 = cuda.mem_alloc_pitch(
    7 * 4, 2,
    numpy.dtype(numpy.float32).itemsize)
print pit, pit2

#cuda.memcpy_htod(sourceImage_gpu, original)
#cuda.memcpy_htod(destImage_gpu, original)
# Host -> pitched device copy: 2 rows of 7 * 4 bytes each.
copy = cuda.Memcpy2D()
copy.set_src_host(original)
copy.set_dst_device(destImage_gpu)
copy.height = 2
copy.width_in_bytes = 7 * 4
copy.src_pitch = 7 * 4
# Use the pitch that mem_alloc_pitch reported for destImage_gpu instead of a
# hard-coded 128 * 4, which is only right when the driver happens to pick it.
copy.dst_pitch = pit
copy(aligned=True)

destImage = original.copy()

func(destImage_gpu,
     sourceImage_gpu,
     numpy.int32(10),
     numpy.int32(2),
     block=(10, 1, 1),
예제 #22
0
def copy_non_contiguous(dst, src):
    """Copy ``src`` to ``dst`` where GPU arrays may be pitched/strided.

    Either argument may be a ``pycuda.gpuarray.GPUArray`` or a host array, so
    all four host/device combinations are handled.  A GPU array may have
    substantial pitches/strides; a host array must be one contiguous block.
    Arrays of one, two or three dimensions are supported.

    Both arrays must have the same dtype and shape (checked with ``assert``).

    Raises RuntimeError for any other number of dimensions.
    """
    assert src.dtype == dst.dtype,\
        "src ({}) and dst ({}) must have the same datatype.".format(str(src.dtype), str(dst.dtype))
    assert dst.shape == src.shape,\
        "Shapes do not match: " + str(dst.shape) + " <-> " + str(src.shape)

    itemsize = np.dtype(src.dtype).itemsize
    copy = cuda.Memcpy2D()
    src_on_gpu = isinstance(src, pycuda.gpuarray.GPUArray)
    dst_on_gpu = isinstance(dst, pycuda.gpuarray.GPUArray)
    if src_on_gpu:
        copy.set_src_device(src.gpudata)
    else:
        copy.set_src_host(src)
    if dst_on_gpu:
        copy.set_dst_device(dst.gpudata)
    else:
        copy.set_dst_host(dst)

    if len(src.shape) == 1:
        # Treat the vector as a column: one element per "row".
        copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
        copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
        copy.width_in_bytes = itemsize
        copy.height = src.shape[0]
        copy(aligned=False)

    elif len(src.shape) == 2:
        # A contiguous host array advances one full row of elements per row.
        host_row_bytes = itemsize * src.shape[1]
        src_elem_strided = src_on_gpu and itemsize != src.strides[1]
        dst_elem_strided = dst_on_gpu and itemsize != dst.strides[1]

        if src_elem_strided or dst_elem_strided:
            # Two substantial pitches/strides cannot be expressed in one
            # CUDA 2D copy, so the arrays are copied column by column.
            # The host pitch must be the full row size here as well: the
            # column offset only selects the element within each row.
            copy.src_pitch = src.strides[0] if src_on_gpu else host_row_bytes
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else host_row_bytes
            copy.width_in_bytes = itemsize
            copy.height = src.shape[0]

            for col in range(src.shape[1]):
                copy.src_x_in_bytes = col * (src.strides[1] if src_on_gpu
                                             else itemsize)
                copy.dst_x_in_bytes = col * (dst.strides[1] if dst_on_gpu
                                             else itemsize)
                copy(aligned=False)
        else:
            # Each row is contiguous in both arrays: a single 2D copy.
            copy.src_pitch = src.strides[0] if src_on_gpu else host_row_bytes
            copy.dst_pitch = dst.strides[0] if dst_on_gpu else host_row_bytes
            copy.width_in_bytes = host_row_bytes
            copy.height = src.shape[0]
            copy(aligned=False)

    elif len(src.shape) == 3:
        src_plane_padded = src_on_gpu and \
            src.strides[0] != src.shape[1] * src.strides[1]
        dst_plane_padded = dst_on_gpu and \
            dst.strides[0] != dst.shape[1] * dst.strides[1]
        if src_plane_padded or dst_plane_padded:
            # A substantial pitch/stride along the z-axis cannot be expressed
            # in a CUDA 3D copy, so the arrays are copied plane by plane.
            for plane in range(src.shape[0]):
                copy_non_contiguous(dst[plane, :, :], src[plane, :, :])
            return

        copy = cuda.Memcpy3D()
        if src_on_gpu:
            copy.set_src_device(src.gpudata)
        else:
            copy.set_src_host(src)
        if dst_on_gpu:
            copy.set_dst_device(dst.gpudata)
        else:
            copy.set_dst_host(dst)

        depth_row_bytes = itemsize * src.shape[2]
        copy.src_pitch = src.strides[1] if src_on_gpu else depth_row_bytes
        copy.dst_pitch = dst.strides[1] if dst_on_gpu else depth_row_bytes
        copy.width_in_bytes = depth_row_bytes
        copy.height = copy.src_height = copy.dst_height = src.shape[1]
        copy.depth = src.shape[0]

        copy()
    else:
        raise RuntimeError("dimension %d is not supported." % len(src.shape))
예제 #23
0
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False):
  """Create a CUDA array from a 2D or 3D host array and copy the data in.

  npArray: host array with two or three dimensions.
  prec: selects the array format.  'cfloat' and 'double' are stored as two
    SIGNED_INT32 channels and 'cdouble' as four; any other value (including
    'float') uses the format derived from ``npArray.dtype`` with one channel.
  order: "C" or "F"; only affects which entry of ``npArray.strides`` is used
    as the row size for 2D input.
  allowSurfaceBind: describe the array with a 3D descriptor and set the
    SURFACE_LDST flag (plus ARRAY3D_LAYERED for 2D input).

  Returns ``(cudaArray, copy)`` where ``copy`` is the Memcpy3D or Memcpy2D
  descriptor that was used.  Raises LogicError for an unknown ``order`` or an
  unsupported number of dimensions.
  """
  dimension = len(npArray.shape)
  if order not in ["C","F"]:
    raise LogicError("order must be either F or C")

  if dimension == 2:
    # Index into npArray.strides that is used as the row size.
    stride = 0 if order == "C" else -1
    h, w = npArray.shape
    d = 1
    if allowSurfaceBind:
      descrArr = cuda.ArrayDescriptor3D()
      descrArr.width = w
      descrArr.height = h
      descrArr.depth = d
    else:
      descrArr = cuda.ArrayDescriptor()
      descrArr.width = w
      descrArr.height = h
  elif dimension == 3:
    stride = 1  # same index for both orders
    d, h, w = npArray.shape
    descrArr = cuda.ArrayDescriptor3D()
    descrArr.width = w
    descrArr.height = h
    descrArr.depth = d
  else:
    raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ")

  if prec in ('cfloat', 'double'):
    # Hack: read complex64 / float64 data as an int2 structure.
    descrArr.format = cuda.array_format.SIGNED_INT32
    descrArr.num_channels = 2
  elif prec == 'cdouble':
    # Hack: read complex128 data as an int4 structure.
    descrArr.format = cuda.array_format.SIGNED_INT32
    descrArr.num_channels = 4
  else:
    descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
    descrArr.num_channels = 1

  if allowSurfaceBind:
    if dimension == 2:
      descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
    descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

  # Allocate once; the 2D path used to allocate a second array and drop
  # this one.
  cudaArray = cuda.Array(descrArr)

  if allowSurfaceBind or dimension == 3:
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D

  copy2D = cuda.Memcpy2D()
  copy2D.set_src_host(npArray)
  copy2D.set_dst_array(cudaArray)
  copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride]
  copy2D.src_height = copy2D.height = h
  copy2D(aligned=True)
  return cudaArray, copy2D
예제 #24
0
파일: parray.py 프로젝트: bionet/vtem
def _assignshape(shape, axis, value):
    """Return ``shape`` as a tuple with the entry at index ``axis`` replaced.

    ``axis`` is compared against non-negative positions only, so a negative
    or out-of-range ``axis`` leaves every entry unchanged.
    """
    return tuple(value if position == axis else extent
                 for position, extent in enumerate(shape))


def PitchTrans(shape, dst, dst_ld, src, src_ld, dtype, aligned=False, async = False, stream = None):    
    size = np.dtype(dtype).itemsize
    
    
    trans = cuda.Memcpy2D()
    trans.src_pitch = src_ld * size
    if isinstance(src, (cuda.DeviceAllocation, int, long)):
        trans.set_src_device(src)
    else:
        trans.set_src_host(src)
    
    trans.dst_pitch = dst_ld * size
    if isinstance(dst, (cuda.DeviceAllocation, int, long)):
        trans.set_dst_device(dst)
    else:
        trans.set_dst_host(dst)
    
    trans.width_in_bytes = _pd(shape) * size
    trans.height = int(shape[0])
    
예제 #25
0
                # So, we attempt to get a contiguous view of dst.
                dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,))
                if async:
                    drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                else:
                    drv.memcpy_dtoh(dst, src.gpudata)
        else:
            src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,))
            if async:
                drv.memcpy_htod_async(dst.gpudata, src, stream=stream)
            else:
                drv.memcpy_htod(dst.gpudata, src)
        return

    if len(shape) == 2:
        copy = drv.Memcpy2D()
    elif len(shape) == 3:
        copy = drv.Memcpy3D()
    else:
        raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),))

    if isinstance(src, GPUArray):
        copy.set_src_device(src.gpudata)
    else:
        copy.set_src_host(src)

    if isinstance(dst, GPUArray):
        copy.set_dst_device(dst.gpudata)
    else:
        copy.set_dst_host(dst)