Пример #1
0
def create_2D_array(mat):
    descr = driver.ArrayDescriptor()
    descr.width = mat.shape[1]
    descr.height = mat.shape[0]
    descr.format = driver.dtype_to_array_format(mat.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = driver.Array(descr)
    return ary
Пример #2
0
def gpuArray2DtocudaArray(gpuArray):
    #import pycuda.autoinit
    h, w = gpuArray.shape
    descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descr2D.num_channels = 1
    cudaArray = cuda.Array(descr2D)
    copy2D = cuda.Memcpy2D()
    copy2D.set_src_device(gpuArray.ptr)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = gpuArray.strides[0]
    copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0]
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, copy2D
Пример #3
0
def resize_gpu(y_gpu, out_shape):

  in_shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  if dtype != np.float32:
    raise NotImplementedException('Only float at the moment')
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])),
               int(np.ceil(float(out_shape[0])/block_size[1])))

  preproc = _generate_preproc(dtype)
  mod = SourceModule(preproc + resize_code, keep=True)

  resize_fun_gpu = mod.get_function("resize")
  resized_gpu = cua.empty(tuple((np.int(out_shape[0]),
                                 np.int(out_shape[1]))),y_gpu.dtype)

  temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1],
                                       y_gpu.shape[0],
                                       4)
  copy_object = cu.Memcpy2D()
  copy_object.set_src_device(y_gpu.gpudata)
  copy_object.set_dst_device(temp_gpu)
  copy_object.src_pitch = 4 * y_gpu.shape[1]
  copy_object.dst_pitch = pitch
  copy_object.width_in_bytes = 4 * y_gpu.shape[1]
  copy_object.height = y_gpu.shape[0]
  copy_object(aligned=False)
  in_tex = mod.get_texref('in_tex')
  descr = cu.ArrayDescriptor()
  descr.width = y_gpu.shape[1]
  descr.height = y_gpu.shape[0]
  descr.format = cu.array_format.FLOAT
  descr.num_channels = 1
  #pitch = y_gpu.nbytes / y_gpu.shape[0]
  in_tex.set_address_2d(temp_gpu, descr, pitch)
  in_tex.set_filter_mode(cu.filter_mode.LINEAR)
  in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)
    
  resize_fun_gpu(resized_gpu.gpudata,
                 np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                 block=block_size, grid=grid_size)
  temp_gpu.free()

  return resized_gpu
Пример #4
0
def np2DtoCudaArray(npArray, allowSurfaceBind=False):
    #import pycuda.autoinit
    h, w = npArray.shape
    descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr2D.num_channels = 1
    if allowSurfaceBind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr2D)
    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = npArray.strides[0]
    copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, descr2D
Пример #5
0
def setup_pitched_texture(tex_ref, shape, pitch, alloc):
    """
    Bind 2D texture to memory location given by alloc with pitch size pitch and shape shape.
    alloc and pitch might come from  e.g.
    alloc,pitch  = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32

    :param tex_reference: 2D texture reference
    :param shape: shape of the array to be placed there
    :param pitch: pitch parameter for CUDA texture binding
    :param alloc: address
    :return:
    """
    assert (pitch % 8) == 0  # for float types
    descr = cuda.ArrayDescriptor()
    descr.format = cuda.array_format.FLOAT
    descr.height = shape[0]
    descr.width = shape[1]
    descr.num_channels = 1
    tex_ref.set_address_2d(alloc, descr, pitch)
    tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
    tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
    tex_ref.set_filter_mode(cuda.filter_mode.POINT)
    tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
Пример #6
0
def mkdsc(dim, ch):
    return argset(cuda.ArrayDescriptor(),
                  height=dim.ah,
                  width=dim.astride,
                  num_channels=ch,
                  format=cuda.array_format.FLOAT)
Пример #7
0
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"):
    """
	Resample SWARM spectrum product in time-domain at R2DBE rate using
	iFFT and then interpolation in the time-domain.
	
	Arguments:
	----------
	Xs -- MxN numpy array in which the zeroth dimension is increasing
	snapshot index, and the first dimension is the positive frequency
	half of the spectrum.
	interp_kind -- Kind of interpolation.
	
	Returns:
	--------
	xs -- The time-domain signal sampled at the R2DBE rate.
	"""
    # timestep sizes for SWARM and R2DBE rates
    dt_s = 1.0 / SWARM_RATE
    dt_r = 1.0 / R2DBE_RATE

    # cuFFT plan for complex to real DFT
    plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32,
                       Xs.shape[0])

    # load complex spectrum to device
    x_d = gpuarray.to_gpu(Xs)
    xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64)

    # pad nyquist with zeros
    block = (32, 32, 1)
    grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])),
            int(ceil(1. * Xs.shape[0] / block[0])))
    fill_padded = mod.get_function("fill_padded")
    fill_padded(int32(Xs.shape[0]),xp_d,int32(Xs.shape[1]+1),x_d,int32(Xs.shape[1]),\
     block=block,grid=grid)

    # allocate memory for time series
    xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32)

    # calculate time series, include scaling
    cu_fft.ifft(xp_d, xf_d, plan, scale=True)

    # and interpolate
    xs_size = int(floor(
        Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1
    TPB = 64  # threads per block
    nB = int(ceil(1. * xs_size / TPB))  # number of blocks
    xs_d = gpuarray.empty(xs_size, float32)  # decimated time-series
    if interp_kind == 'nearest':
        # compile kernel
        nearest_interp = mod.get_function(interp_kind)
        # call kernel
        nearest_interp(xf_d,
                       xs_d,
                       int32(xs_size),
                       float64(dt_r / dt_s),
                       block=(TPB, 1, 1),
                       grid=(nB, 1))
    elif interp_kind == 'linear':
        # compile kernel
        linear_interp = mod.get_function("copy_texture_kernel")
        # get texture reference
        a_texref = mod.get_texref("a_tex")
        a_texref.set_filter_mode(drv.filter_mode.LINEAR)  # linear
        #a_texref.set_filter_mode(drv.filter_mode.POINT)	# nearest-neighbor
        # move time series to texture reference
        # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html
        descr = drv.ArrayDescriptor()
        descr.format = drv.array_format.FLOAT
        descr.height = Xs.shape[0]
        descr.width = SWARM_SAMPLES_PER_WINDOW
        descr.num_channels = 1
        a_texref.set_address_2d(xf_d.gpudata, descr,
                                SWARM_SAMPLES_PER_WINDOW * 4)
        # set up linear interpolation over texture
        linear_interp(xs_d,int32(xs_size),float64(dt_r/dt_s),int32(SWARM_SAMPLES_PER_WINDOW),\
          texrefs=[a_texref],block=(TPB,1,1),grid=(nB,1))

    return xs_d.get()
Пример #8
0
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False):
  ''' Some parameters like stride are explained in PyCUDA: driver.py test_driver.py gpuarray.py'''
  # For 1D-2D Cuda Arrays the descriptor is the same just puttin LAYERED flags
#   if order != "C": raise LogicError("Just implemented for C order")
  dimension = len(npArray.shape)
  case = order in ["C","F"]
  if not case:
    raise LogicError("order must be either F or C")
#   if dimension == 1:
#       w = npArray.shape[0]
#       h, d = 0,0
  if dimension == 2:
      if order == "C": stride = 0
      if order == "F": stride = -1
      h, w = npArray.shape
      d = 1
      if allowSurfaceBind:
        descrArr = cuda.ArrayDescriptor3D()
        descrArr.width = w
        descrArr.height = h
        descrArr.depth = d
      else:
        descrArr = cuda.ArrayDescriptor()
        descrArr.width = w
        descrArr.height = h
#         descrArr.depth = d
  elif dimension == 3:
      if order == "C": stride = 1
      if order == "F": stride = 1
      d, h, w = npArray.shape
      descrArr = cuda.ArrayDescriptor3D()
      descrArr.width = w
      descrArr.height = h
      descrArr.depth = d
  else:
      raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ")
  if prec == 'float':
    descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
    descrArr.num_channels = 1
  elif prec == 'cfloat': # Hack for complex 64 = (float 32, float 32) == (re,im)
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure
    descrArr.num_channels = 2
  elif prec == 'double': # Hack for doubles
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure
    descrArr.num_channels = 2
  elif prec == 'cdouble': # Hack for doubles
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure
    descrArr.num_channels = 4
  else:
    descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
    descrArr.num_channels = 1

  if allowSurfaceBind:
    if dimension==2:  descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
    descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

  cudaArray = cuda.Array(descrArr)
  if allowSurfaceBind or dimension==3 :
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
#     if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support
#     if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
  else:
#     if dimension == 3:
#       copy3D = cuda.Memcpy3D()
#       copy3D.set_src_host(npArray)
#       copy3D.set_dst_array(cudaArray)
#       copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
# #       if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support
# #       if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support
#       copy3D.src_height = copy3D.height = h
#       copy3D.depth = d
#       copy3D()
#       return cudaArray, copy3D
#     if dimension == 2:
      cudaArray = cuda.Array(descrArr)
      copy2D = cuda.Memcpy2D()
      copy2D.set_src_host(npArray)
      copy2D.set_dst_array(cudaArray)
      copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride]
#       copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] #Jut C order support
      copy2D.src_height = copy2D.height = h
      copy2D(aligned=True)
      return cudaArray, copy2D
Пример #9
0
    def update(self, depth, rgb_img=None):

        # Compute the real world depths.
        # TODO: Determine the best block size.
        depth_gpu = gpuarray.to_gpu(np.float32(depth))
        width = depth_gpu.shape[1]
        height = depth_gpu.shape[0]
        gridx = (width - 1) // 16 + 1
        gridy = (height - 1) // 16 + 1
        pitch = depth_gpu.strides[0]
        self.compute_depth(depth_gpu,
                           np.int32(width),
                           np.int32(height),
                           np.intp(pitch),
                           block=(16, 16, 1),
                           grid=(gridx, gridy))

        # Prepare the depth array to be accessed as a texture.
        descr = drv.ArrayDescriptor()
        descr.width = width
        descr.height = height
        descr.format = drv.array_format.FLOAT
        descr.num_channels = 1
        self.depth_texture.set_address_2d(depth_gpu.gpudata, descr, pitch)

        # Smooth depth.
        # pitch = self.smooth_depth_gpu.strides[0]
        # self.compute_smooth_depth(self.smooth_depth_gpu, np.int32(width), np.int32(height),
        #                     np.intp(pitch), np.float32(10.0), np.float32(10000.0),
        #                     block=(16,16,1), grid=(gridx, gridy))
        #self.smooth_depth_texture.set_address_2d(depth_gpu.gpudata, descr, pitch)

        # Buffer mapping.
        normals_pitch = 640 * 12
        vertex_measure_map, normal_measure_map = map(methodcaller("map"),
                                                     self.buffers["measure"])
        vertices_measure = np.intp(vertex_measure_map.device_ptr_and_size()[0])
        normals_measure = np.intp(normal_measure_map.device_ptr_and_size()[0])
        vertex_raycast_map, normal_raycast_map = map(methodcaller("map"),
                                                     self.buffers["raycast"])
        vertices_raycast = np.intp(vertex_raycast_map.device_ptr_and_size()[0])
        normals_raycast = np.intp(normal_raycast_map.device_ptr_and_size()[0])

        # Measure
        self.measure(vertices_measure,
                     normals_measure,
                     self.mask_gpu,
                     np.int32(width),
                     np.int32(height),
                     np.intp(normals_pitch),
                     block=(16, 16, 1),
                     grid=(gridx, gridy))

        # Update the reconstruction.
        grid2 = int((self.side - 1) // 8 + 1)
        for i in xrange(0, self.side, 8):
            self.update_reconstruction(self.F_gpu,
                                       self.W_gpu,
                                       normals_measure,
                                       np.intp(normals_pitch),
                                       np.int32(self.side),
                                       np.float32(self.units_per_voxel),
                                       np.float32(self.mu),
                                       np.int32(i),
                                       self.T_gk_gpu,
                                       block=(8, 8, 8),
                                       grid=(grid2, grid2))

        # Copy F from gpu to F_array (binded to F_texture).
        self.F_gpu_to_array_copy()

        # Raycast.
        bbox = self.get_bounding_box()
        point = self.T_gk[:3, 3]
        mindistance = distance_to_bbox(bbox, point)
        maxdistance = distance_farthest_to_bbox(bbox, point)
        self.raycast(vertices_raycast,
                     normals_raycast,
                     np.int32(width),
                     np.int32(height),
                     np.intp(normals_pitch),
                     np.int32(self.side),
                     np.float32(self.units_per_voxel),
                     np.float32(self.mu),
                     self.T_gk_gpu,
                     np.float32(mindistance),
                     np.float32(maxdistance),
                     block=(16, 16, 1),
                     grid=(gridx, gridy))

        # Tracking.
        # __global__ void compute_tracking_matrices(float* AA, float* Ab, float* omega,
        #                         float3* vertices_measure, float3* normals_measure,
        #                         float3* vertices_raycast, float3* normals_raycast,
        #                         int width, int height, size_t A_pitch,
        #                         float* mask, float* Tgk, float* Tgk1_k,
        #                         float threshold_distance)
        if self.active_tracking:
            self.AA_gpu.fill(0)
            self.Ab_gpu.fill(0)
            self.compute_tracking_matrices(self.AA_gpu,
                                           self.Ab_gpu,
                                           self.omega_gpu,
                                           vertices_measure,
                                           normals_measure,
                                           vertices_raycast,
                                           normals_raycast,
                                           np.int32(width),
                                           np.int32(height),
                                           np.intp(self.AA_gpu.strides[0]),
                                           np.intp(self.Ab_gpu.strides[0]),
                                           self.mask_gpu,
                                           self.T_gk_gpu,
                                           self.Tgk1_k_gpu,
                                           np.float32(20.0),
                                           block=(16, 16, 1),
                                           grid=(gridx, gridy))

            cudareduce.add_vectors(self.AA_gpu, 640 * 480, 21)
            cudareduce.add_vectors(self.Ab_gpu, 640 * 480, 6)
            drv.memcpy_dtoh(self.AA, self.AA_gpu.gpudata)
            drv.memcpy_dtoh(self.Ab, self.Ab_gpu.gpudata)

            # Solve the system.
            AA = np.zeros((6, 6))
            AA[np.triu_indices(6)] = self.AA
            AA.T[np.triu_indices(6)] = self.AA
            try:
                x = np.linalg.solve(AA, self.Ab)
                Tinc = np.array([[1, x[2], -x[1],
                                  x[3]], [-x[2], 1, x[0], x[4]],
                                 [x[1], -x[0], 1, x[5]], [0, 0, 0, 1]])
                U, D, V = np.linalg.svd(Tinc[:3, :3])
                Tinc[:3, :3] = np.dot(U, V)
            except np.linalg.LinAlgError:
                Tinc = np.eye(4)

            self.T_gk = np.float32(np.dot(Tinc, self.T_gk))
            self.T_gk_gpu = gpuarray.to_gpu(self.T_gk[:3])

        vertex_raycast_map.unmap()
        normal_raycast_map.unmap()
        vertex_measure_map.unmap()
        normal_measure_map.unmap()