Example #1
File: layer.py Project: smessing/striate
  def __init__(self , name, filter_shape, image_shape,  padding = 2, stride = 1, initW = 0.01, initB =
      0.0, epsW = 0.001, epsB = 0.002, bias = None, weight = None):
    Layer.__init__(self, name, 'conv')

    self.filterSize = filter_shape[2]
    self.numFilter = filter_shape[0]
    self.imgShape = image_shape

    self.batchSize, self.numColor, self.imgSize, _ = image_shape
    self.padding = padding
    self.stride = stride
    self.initW = initW
    self.initB = initB
    self.epsW = epsW
    self.epsB = epsB

    self.outputSize = 1 + int(((2 * self.padding + self.imgSize - self.filterSize) / float(self.stride)))
    self.modules = self.outputSize ** 2

    if weight is None:
      self.filter = gpuarray.to_gpu(np.random.randn(self.filterSize * self.filterSize *
        self.numColor, self.numFilter) * self.initW).astype(np.float32)
    else:
      self.filter = gpuarray.to_gpu(weight).astype(np.float32)

    if bias is None:
      self.bias = gpuarray.to_gpu(np.random.randn(self.numFilter, 1) * initB).astype(np.float32)
    else:
      self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.filterGrad = gpuarray.zeros_like(self.filter)
    self.biasGrad = gpuarray.zeros_like(self.bias)
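
Note: the outputSize expression above is the standard padded, strided convolution rule, outputSize = 1 + floor((2*padding + imgSize - filterSize) / stride). With the default padding = 2 and stride = 1, and illustrative values imgSize = 32 and filterSize = 5 (not taken from the source), this gives 1 + (4 + 32 - 5) / 1 = 32, so modules = 32**2 = 1024 output positions per filter.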
Example #2
 def __init__(self, bend_coefs, N, QN, NON, NR, x_nd, K_nn, rot_coef, 
              QN_gpu = None, WQN_gpu = None, NON_gpu = None, NHN_gpu = None):
     for b in bend_coefs:
         assert b in NON, 'no solver found for bending coefficient {}'.format(b)
     self.rot_coef = rot_coef
     self.n, self.d  = x_nd.shape
     self.bend_coefs = bend_coefs
     self.N          = N
     self.QN         = QN        
     self.NON        = NON
     self.NR         = NR
     self.x_nd       = x_nd
     self.K_nn       = K_nn
     ## set up GPU memory
     if QN_gpu is None:
         self.QN_gpu = gpuarray.to_gpu(self.QN)
     else:
         self.QN_gpu = QN_gpu
     if WQN_gpu is None:            
         self.WQN_gpu = gpuarray.zeros_like(self.QN_gpu)
     else:
         self.WQN_gpu = WQN_gpu
     if NON_gpu is None:            
         self.NON_gpu = {}
         for b in bend_coefs:
             self.NON_gpu[b] = gpuarray.to_gpu(self.NON[b])
     else:
         self.NON_gpu = NON_gpu
     if NHN_gpu is None:            
         self.NHN_gpu = gpuarray.zeros_like(self.NON_gpu[bend_coefs[0]])
     else:
         self.NHN_gpu = NHN_gpu
     self.valid = True
Example #3
File: layer.py Project: wqren/striate
    def __init__(self,
                 name,
                 type,
                 epsW,
                 epsB,
                 initW,
                 initB,
                 momW,
                 momB,
                 wc,
                 weight,
                 bias,
                 weightIncr,
                 biasIncr,
                 weightShape,
                 biasShape,
                 disableBprop=False):
        Layer.__init__(self, name, type, disableBprop)

        self.epsW = F(epsW)
        self.epsB = F(epsB)
        self.initW = initW
        self.initB = initB
        self.momW = F(momW)
        self.momB = F(momB)
        self.wc = F(wc)

        if weight is None:
            self.weight = gpuarray.to_gpu(
                randn(weightShape, np.float32) * self.initW)
        else:
            print >> sys.stderr, 'init weight from disk'
            self.weight = gpuarray.to_gpu(weight)  #.astype(np.float32)

        if bias is None:
            if self.initB > 0.0:
                self.bias = gpuarray.to_gpu(
                    (np.ones(biasShape, dtype=np.float32) * self.initB))
            else:
                self.bias = gpuarray.zeros(biasShape, dtype=np.float32)
        else:
            print >> sys.stderr, 'init bias from disk'
            self.bias = gpuarray.to_gpu(bias).astype(np.float32)

        self.weightGrad = gpuarray.zeros_like(self.weight)
        self.biasGrad = gpuarray.zeros_like(self.bias)
        if self.momW > 0.0:
            if weightIncr is None:
                self.weightIncr = gpuarray.zeros_like(self.weight)
            else:
                print >> sys.stderr, 'init weightIncr from disk'
                #weightIncr = np.require(weightIncr, dtype = np.float, requirements = 'C')
                self.weightIncr = gpuarray.to_gpu(weightIncr)
        if self.momW > 0.0:
            if biasIncr is None:
                self.biasIncr = gpuarray.zeros_like(self.bias)
            else:
                print >> sys.stderr, 'init biasIncr from disk'
                #biasIncr = np.require(biasIncr, dtype = np.float, requirements = 'C')
                self.biasIncr = gpuarray.to_gpu(biasIncr)
Example #4
File: algo.py Project: uncbiag/PStrip
def computeEnergy(D_v, S, T, _Lambda, _gamma_c, Alpha, Beta):
    l, m, n = S.shape

    sum_alpha_beta = gpuarray.zeros_like(D_v)
    sk_linalg.dot(Beta, Alpha, out=sum_alpha_beta)

    GR = grad(T)
    square_matrix(GR, GR)
    G_norm = gpuarray.zeros_like(T)
    sum_three_matrix(GR[0, :, :, :], GR[1, :, :, :], GR[2, :, :, :], G_norm,
                     1.0, 1.0, 1.0)
    sqrt_matrix(G_norm, G_norm)
    #    multiply_matrix(G_norm, _Gamma, G_norm)
    ET = _gamma_c * gpuarray.sum(G_norm)

    SP = gpuarray.zeros_like(S)
    absolute_matrix(S, SP)
    multiply_matrix(SP, _Lambda, SP)
    ES = gpuarray.sum(SP)

    sparse = D_v - S.reshape(l * m * n, 1) - T.reshape(l * m * n,
                                                       1) - sum_alpha_beta
    square_matrix(sparse, sparse)
    EL = gpuarray.sum(sparse)

    E = 0.5 * EL.get() + ES.get() + ET.get()

    return EL.get(), ES.get(), ET.get(), E
Example #5
    def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
        self.volume = gu.to_gpu(volume)

        self.template = Volume(template)
        self.templatePadded = gu.zeros_like(self.volume, dtype=np.float32)

        self.mask = Volume(mask)
        self.maskPadded = gu.zeros_like(self.volume, dtype=np.float32)
        self.sOrg = mask.shape
        self.sPad = volume.shape
        print(self.sPad, self.sOrg)
        rotate(self.mask, [0, 0, 0], self.maskPadded, self.sPad, self.sOrg)
        #paste_in_center_gpu(self.template.d_data, self.templatePadded, np.int32(self.sPad), np.int32(self.maskSize), block=(10, 10, 10), grid=(8,1,1))
        #rotate(self.template, [0, 0, 0], self.templatePadded, self.sPad, self.maskSize)
        print(volume.shape, stdV.shape, wedge.shape)
        self.wedge = gu.to_gpu(wedge)
        self.stdV = gu.to_gpu(stdV)

        self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
        self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)

        self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
        self.template_fft = gu.zeros_like(self.volume, dtype=np.complex64)

        self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
        self.norm_volume = np.prod(volume.shape)

        self.scores = gu.ones_like(self.volume, dtype=np.float32) * -1000
        self.angles = gu.ones_like(self.volume, dtype=np.float32) * -1000
        self.p = sum(self.mask.d_data)
Example #6
  def _init_weights(self, weight_shape, bias_shape):
    if self.weight is None:
      if self.name == 'noise':
        assert(weight_shape[0] == weight_shape[1])
        self.weight = gpuarray.to_gpu(np.eye(weight_shape[0], dtype = np.float32))
      else:
        self.weight = gpuarray.to_gpu(randn(weight_shape, np.float32) * self.initW)

    if self.bias is None:
      if self.initB > 0.0:
        self.bias = gpuarray.to_gpu((np.ones(bias_shape, dtype=np.float32) * self.initB))
      else:
        self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

    Assert.eq(self.weight.shape, weight_shape) 
    Assert.eq(self.bias.shape, bias_shape) 
    
    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
    
    if self.momW > 0.0:
      if self.weightIncr is None:
        self.weightIncr = gpuarray.zeros_like(self.weight)
      if self.biasIncr is None:
        self.biasIncr = gpuarray.zeros_like(self.bias)
      
      Assert.eq(self.weightIncr.shape, weight_shape) 
      Assert.eq(self.biasIncr.shape, bias_shape)
Example #7
    def _init_weights(self, weight_shape, bias_shape):
        if self.weight is None:
            if self.name == 'noise':
                assert (weight_shape[0] == weight_shape[1])
                self.weight = gpuarray.to_gpu(
                    np.eye(weight_shape[0], dtype=np.float32))
            else:
                self.weight = gpuarray.to_gpu(
                    randn(weight_shape, np.float32) * self.initW)

        if self.bias is None:
            if self.initB > 0.0:
                self.bias = gpuarray.to_gpu(
                    (np.ones(bias_shape, dtype=np.float32) * self.initB))
            else:
                self.bias = gpuarray.zeros(bias_shape, dtype=np.float32)

        Assert.eq(self.weight.shape, weight_shape)
        Assert.eq(self.bias.shape, bias_shape)

        self.weightGrad = gpuarray.zeros_like(self.weight)
        self.biasGrad = gpuarray.zeros_like(self.bias)

        if self.momW > 0.0:
            if self.weightIncr is None:
                self.weightIncr = gpuarray.zeros_like(self.weight)
            if self.biasIncr is None:
                self.biasIncr = gpuarray.zeros_like(self.bias)

            Assert.eq(self.weightIncr.shape, weight_shape)
            Assert.eq(self.biasIncr.shape, bias_shape)
Example #8
 def dataToGPU(self):
     # Allocate device memory and copy host to device
     self.start_gpu_time.record()
     self.d_points = gpu.to_gpu(
         self.points.reshape(-1).view(gpu.vec.float3))
     self.d_discPoints = gpu.zeros(
         shape=int(self.nPoints + self.nQueries),
         dtype=gpu.vec.int3)  # discretized location of points
     self.d_floodMap = gpu.empty(shape=int(self.mapLength * self.mapLength),
                                 dtype=np.int32)  # 1+JFA map
     self.d_floodMap.fill(np.int32(-1))  # initialize to -1
     self.d_tempMap = gpu.zeros_like(
         self.d_floodMap)  # swap memory for floodMap
     self.d_tempMap.fill(np.int32(-1))  # initialize to -1
     self.d_queryMap = gpu.zeros_like(
         self.d_floodMap)  # query (interpolation) map
     self.d_queryMap.fill(np.int32(-1))  # initialize to -1
     self.d_queryValues = gpu.zeros(
         shape=int(self.nQueries),
         dtype=gpu.vec.float2)  # for calculating stolen area
     self.d_colors = gpu.zeros(shape=int(self.nPoints + self.nQueries),
                               dtype=gpu.vec.uchar3)  # color map
     self.d_voronoi = gpu.zeros(
         shape=int(self.mapLength * self.mapLength),
         dtype=gpu.vec.uchar3)  # rendered Voronoi image
     self.end_gpu_time.record()
     self.end_gpu_time.synchronize()
     self.gpu_transfer_time += self.start_gpu_time.time_till(
         self.end_gpu_time) * 1e-3
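
Note: start_gpu_time and end_gpu_time are CUDA events, and Event.time_till reports the elapsed time in milliseconds, hence the 1e-3 factor so that gpu_transfer_time accumulates seconds.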
Example #9
File: batchtps.py Project: rll/lfd
    def add_cld(
        self,
        name,
        proj_mats,
        offset_mats,
        cloud_xyz,
        kernel,
        scale_params,
        r_traj,
        r_traj_K,
        l_traj,
        l_traj_K,
        update_ptrs=False,
    ):
        """
        does the normal add, but also adds the trajectories
        """
        # don't update ptrs there, do it after this
        GPUContext.add_cld(self, name, proj_mats, offset_mats, cloud_xyz, kernel, scale_params, update_ptrs=False)
        self.r_traj.append(gpu_pad(r_traj, (MAX_TRAJ_LEN, DATA_DIM)))
        self.r_traj_K.append(gpu_pad(r_traj_K, (MAX_TRAJ_LEN, MAX_CLD_SIZE)))
        self.l_traj.append(gpu_pad(l_traj, (MAX_TRAJ_LEN, DATA_DIM)))
        self.l_traj_K.append(gpu_pad(l_traj_K, (MAX_TRAJ_LEN, MAX_CLD_SIZE)))

        self.r_traj_w.append(gpuarray.zeros_like(self.r_traj[-1]))
        self.l_traj_w.append(gpuarray.zeros_like(self.l_traj[-1]))

        self.l_traj_dims.append(l_traj.shape[0])
        self.r_traj_dims.append(r_traj.shape[0])

        if update_ptrs:
            self.update_ptrs()
Example #10
File: layer.py Project: smessing/striate
  def __init__(self, name, input_shape, n_out, epsW=0.001, epsB=0.002, initW = 0.01, initB = 0.0, weight =
      None, bias = None):
    Layer.__init__(self, name, 'fc')
    self.epsW = epsW
    self.epsB = epsB
    self.initW = initW
    self.initB = initB
    
    self.inputShape = input_shape
    self.inputSize, self.batchSize = input_shape
    
    self.outputSize = n_out

    self.weightShape = (self.outputSize, self.inputSize)
    if weight is None:
      self.weight = gpuarray.to_gpu(np.random.randn(*self.weightShape) *
          self.initW).astype(np.float32)
    else:
      self.weight = gpuarray.to_gpu(weight).astype(np.float32)

    if bias is None:
      self.bias = gpuarray.to_gpu(np.random.randn(self.outputSize, 1) *
          self.initB).astype(np.float32)
    else:
      self.bias = gpuarray.to_gpu(bias).astype(np.float32)
    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
Example #11
    def __init__(self,
                 operator,
                 data,
                 u,
                 v,
                 tau,
                 inner_iters,
                 relative_tolerance=1e-20,
                 absolute_tolerance=1e-19,
                 verbose=0,
                 EHs=None):
        self._data = data
        self._op = operator
        self._iters = inner_iters
        self._tau = tau
        self._relative_tolerance = relative_tolerance
        self._absolute_tolerance = absolute_tolerance
        src_shape = (self._data.nX1, self._data.nX2, 1)
        self._dest_shape = (self._data.nT, self._data.nC)
        self.converged = False
        self.iteration = 0
        self._verbose = (verbose > 1)

        try:
            recondata_gpu = self._op.dgpu['recondata']
        except NameError:
            recondata_gpu = gpuarray.to_gpu(self._data.recondata)

        # y
        if EHs is None:
            self.EHs = gpuarray.zeros(src_shape, dtype=np.complex64)
            self._op.adjoint(recondata_gpu, self.EHs)
        else:
            self.EHs = EHs

        self._m = u

        self.rhs = gpuarray.zeros_like(self.EHs)
        inner_cg_rhs(self.rhs, u, v, self.EHs, self._tau)

        self._p_k = gpuarray.zeros_like(self.EHs)
        self._v_k = gpuarray.zeros_like(self.EHs)

        self._residual_k = gpuarray_copy(self.rhs)
        self._forward(self._m, self._v_k)  # initial guess

        self._residual_k = self._residual_k - self._v_k
        self._v_k = gpuarray_copy(self._residual_k)

        self._rho_0 = measure(self._v_k, self._residual_k)
        self._rho_k = self._rho_0

        if self._rho_0 <= self._absolute_tolerance:
            if self._verbose:
                print("Already converged!")
            self.converged = True
            self.iteration = 0

        self._p_k = gpuarray_copy(self._v_k)
Example #12
 def rfftn(self):
     # it seems that we can just take half of the original fft
     # in both arr, arrC so that we match what was here originally
     zeros = gpuarray.zeros_like(self.arr) 
     arr = gpuarray.zeros_like(self.arr) 
     arrC = gpuarray.zeros_like(self.arr) 
     self.plan.execute(self.arr, zeros, data_out_re=arr, data_out_im=arrC)
     return CUDAArray(arr, arrC)
Example #13
def same_reduce_multiview(target, vec, num_view):
  block = (target.size, 1, 1)
  grid = (1, 1)
  tmp = gpuarray.zeros_like(target)
  ids = gpuarray.zeros_like(target)
  _same_reduce_multiview_(target, vec, tmp, ids, I(num_view), block = block , grid = grid)
  tmp = tmp.reshape((1, tmp.size))
  res = gpuarray.to_gpu(np.zeros((1, 1)).astype(np.float32))
  add_row_sum_to_vec(res, tmp)

  return res.get()[0, 0]
Example #14
 def __init__(self, volume, template, mask, gpu):
     self.gpu = gpu
     self.volume = gu.to_gpu(volume)
     self.template = Volume(template)
     self.mask = gu.to_gpu(mask)
     self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
     self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)
     self.volume_fft = gu.zeros_like(self.volume, dtype=np.complex64)
     self.template_fft = gu.zeros_like(self.template.d_data, dtype=np.complex64)
     self.ccc_map = gu.zeros_like(self.volume, dtype=np.float32)
     self.norm_volume = np.prod(volume.shape)
     self.scores = gu.zeros_like(self.volume, dtype=np.float32)
     self.angles = gu.zeros_like(self.volume, dtype=np.float32)
Example #15
def _conv3d_sep(data, kz, ky, kx):
    assert data.ndim == 3

    with open(__cudafile__, "r") as f:
        _mod_conv = SourceModule(f.read())
        gpu_conv3d_0 = _mod_conv.get_function("conv3d_axis0")
        gpu_conv3d_1 = _mod_conv.get_function("conv3d_axis1")
        gpu_conv3d_2 = _mod_conv.get_function("conv3d_axis2")

    d_gpu = asgpuarray(data)
    kz_gpu = asgpuarray(kz, np.float32)
    ky_gpu = asgpuarray(ky, np.float32)
    kx_gpu = asgpuarray(kx, np.float32)
    r1_gpu = gpuarray.zeros_like(d_gpu)
    r2_gpu = gpuarray.zeros_like(d_gpu)

    shape = np.asarray(data.shape[::-1], dtype=int3)
    block, grid = grid_kernel_config(gpu_conv3d_0, data.shape)

    gpu_conv3d_0(
        d_gpu,
        kz_gpu,
        r1_gpu,
        shape,
        np.int32(kz.size // 2),
        block=block,
        grid=grid,
        shared=(kz.size * kz.itemsize),
    )
    gpu_conv3d_1(
        r1_gpu,
        ky_gpu,
        r2_gpu,
        shape,
        np.int32(ky.size // 2),
        block=block,
        grid=grid,
        shared=(ky.size * ky.itemsize),
    )
    gpu_conv3d_2(
        r2_gpu,
        kx_gpu,
        r1_gpu,
        shape,
        np.int32(kx.size // 2),
        block=block,
        grid=grid,
        shared=(kx.size * kx.itemsize),
    )

    return r1_gpu
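
Note: this is the usual separable-filter decomposition: three 1-D passes (axis 0, 1, 2) cost on the order of 3k multiply-adds per voxel instead of the k**3 of a dense 3-D kernel (for k = 5 that is 15 versus 125), and the two zeros_like buffers r1_gpu and r2_gpu are simply ping-ponged between passes so no further temporaries are needed.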
Example #16
 def __init__(self, particle, reference, mask, wedge, maskIsSphere=True):
     import pycuda.gpuarray as gu
     from voltools.volume import Volume
     self.particle = gu.to_gpu(particle)
     self.template = Volume(reference)
     self.wedge = Volume(wedge)
     self.mask = Volume(mask)
     self.mask.d_data = gu.to_gpu(mask)
     self.fwd_plan = Plan(particle.shape, particle.dtype, np.complex64)
     self.inv_plan = Plan(particle.shape, np.complex64, particle.dtype)
     self.volume_fft = gu.zeros_like(self.particle, dtype=np.complex64)
     self.template_fft = gu.zeros_like(self.template.d_data,
                                       dtype=np.complex64)
     self.ccc_map = gu.zeros_like(self.particle, dtype=np.float32)
Example #17
    def __init__(self,
                 params,
                 learning_rate_init=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-8):
        super(AdamOptimizer, self).__init__(params, learning_rate_init)

        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0
        self.ms = [gpuarray.zeros_like(param) for param in params]
        self.vs = [gpuarray.zeros_like(param) for param in params]
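
Example #17 shows only the constructor, which uses gpuarray.zeros_like to allocate the first- and second-moment buffers ms and vs; the update method itself is not part of the snippet. The following is a minimal sketch of how such buffers are typically consumed, assuming textbook Adam and plain pycuda gpuarray arithmetic (adam_step and its signature are illustrative, not taken from the source):

import pycuda.cumath as cumath

def adam_step(param, grad, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
    # One Adam update on pycuda GPUArrays; m and v play the role of ms[i] / vs[i] above.
    m = beta_1 * m + (1.0 - beta_1) * grad           # biased first-moment estimate
    v = beta_2 * v + (1.0 - beta_2) * grad * grad    # biased second-moment estimate
    m_hat = m / (1.0 - beta_1 ** t)                  # bias corrections, with t counted from 1
    v_hat = v / (1.0 - beta_2 ** t)
    param = param - lr * m_hat / (cumath.sqrt(v_hat) + epsilon)
    return param, m, v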
Example #18
def sqrt_normalize_gpu(img):
    global posr, negr, posa, nega, stream
    rgb = gpuarray.to_gpu(img[:, :, :3].copy())
    a = gpuarray.to_gpu(img[:, :, 3].copy())

    if not posr:
        posr = gpuarray.zeros_like(rgb) + 1
        negr = gpuarray.zeros_like(rgb) - 1
        posa = gpuarray.zeros_like(a) + 1
        nega = gpuarray.zeros_like(a) - 1
    rgb = cumath.sqrt(abs(rgb), stream=stream) * gpuarray.if_positive(
        rgb, posr, negr, stream=stream)
    a = cumath.sqrt(abs(a), stream=stream) * gpuarray.if_positive(
        a, posa, nega, stream=stream)
    return normalize_gpu(rgb, a)
Example #19
    def __init__(self, gpu_detector, ndaq=1):
        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu

        self.module = get_cu_module('daq.cu', options=cuda_options, 
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels
Example #20
File: correlation.py Project: xmzzaa/PyTom
 def __init__(self, volume, template, gpu):
     self.gpu = gpu
     volume_gpu = gu.to_gpu(volume)
     self.fwd_plan = Plan(volume.shape, volume.dtype, np.complex64)
     self.volume_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
     fft(volume_gpu, self.volume_fft, self.fwd_plan)
     self.template_fft = gu.zeros_like(volume_gpu, dtype=np.complex64)
     self.ccc_map = gu.zeros_like(volume_gpu, dtype=np.float32)
     self.norm_volume = np.prod(volume_gpu.shape)
     #self.scores = gu.zeros_like(volume_gpu, dtype=np.float32)
     #self.angles = gu.zeros_like(volume_gpu, dtype=np.float32)
     self.padded_volume = gu.zeros_like(volume_gpu, dtype=np.float32)
     del volume_gpu
     self.inv_plan = Plan(volume.shape, np.complex64, volume.dtype)
     self.template = Volume(template)
Example #21
def tvdenoising3d(data, lamda=15, max_iter=100):
    assert data.ndim == 3

    with open(op.join(__dirname__, "kernels", "tv.cu"), "r") as f:
        _mod_tv = SourceModule(f.read())
        gpu_tv_u = _mod_tv.get_function("update_u")
        gpu_tv_p = _mod_tv.get_function("update_p")

    dsize = np.prod(data.shape)

    f_gpu = asgpuarray(data)
    u_gpu = f_gpu.copy()
    z_gpu = gpuarray.zeros_like(f_gpu)
    y_gpu = gpuarray.zeros_like(f_gpu)
    x_gpu = gpuarray.zeros_like(f_gpu)

    lamda = np.float32(1.0 / lamda)
    # z, y, x = map(np.int32, data.shape)
    shape = np.asarray(data.shape[::-1], dtype=int3)
    mtpb = gpu_tv_u.max_threads_per_block
    block, grid = flat_kernel_config(gpu_tv_u, data.shape)

    for i in range(max_iter):
        tau2 = np.float32(0.3 + 0.02 * i)
        tau1 = np.float32((1.0 / tau2) * ((1.0 / 6.0) - (5.0 / (15.0 + i))))

        gpu_tv_u(
            f_gpu,
            z_gpu,
            y_gpu,
            x_gpu,
            u_gpu,
            tau1,
            lamda,
            shape,
            block=block,
            grid=grid,
        )
        gpu_tv_p(u_gpu,
                 z_gpu,
                 y_gpu,
                 x_gpu,
                 tau2,
                 shape,
                 block=block,
                 grid=grid)

    return u_gpu
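
Note: the denoised result is returned still on the device; a host copy can be obtained with the usual pycuda idiom, e.g. tvdenoising3d(data).get().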
Example #22
def create_quantiles(data, params):
    global quantiles, q_lb, q_ub, mask
    sort_gpu(data)

    if mask.shape != data.shape:
        mask = gpuarray.zeros_like(data)

    n_lb = gpuarray.sum(data < mask)
    n_ub = gpuarray.sum(data > mask)

    fill_lb_quantiles(data,
                      quantiles,
                      n_lb,
                      n_ub,
                      q_lb,
                      block=(quantiles.shape[0], 1, 1))
    fill_ub_quantiles(data,
                      quantiles,
                      n_lb,
                      n_ub,
                      q_ub,
                      block=(quantiles.shape[0], 1, 1))
    q_lb = q_lb.reverse()

    p_ub = n_ub / (n_ub + n_lb)

    del n_lb, n_ub

    return data, q_lb.get(), q_ub.get(), probs * (
        1 - p_ub.get()), probs * p_ub.get()
Example #23
def evolve_linear(z, deltax):
    """
    Input type IN must be numpy or 21cmfast
    """

    fgrowth = pb.fgrowth(z, COSMO['omega_M_0'])  #normalized to 1 at z=0
    #primordial_fgrowth = pb.fgrowth(INITIAL_REDSHIFT, cosmo['omega_M_0']) #normalized to 1 at z=0

    updated = deltax * fgrowth

    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)

    if False:  #velocity information may not be useful for linear field
        plan = Plan(HII_shape, dtype=np.complex64)
        deltak_d = deltax_d.astype(np.complex64)
        vbox_d = gpuarray.zeros_like(deltak_d)
        plan.execute(deltak_d)
        dDdt_D = np.float32(dDdt_D(z))
        for num, mode in enumerate(['x', 'y', 'z']):
            velocity_kernel(deltak_d,
                            vbox_d,
                            dDdt_D,
                            DIM,
                            np.int32(num),
                            block=block_size,
                            grid=grid_size)
            np.save(
                parent_folder +
                "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                    mode, HII_DIM, BOX_LEN), smallvbox_d.get())

    return
Example #24
 def _compile_kernels(self):
     mod = SourceModule(
         """
         // Extract the upper diagonals of a square (N, N) matrix.
         __global__ void extract_upper_diags(float* matrix, float* diags, int N) {
             int x = blockDim.x * blockIdx.x + threadIdx.x;
             int y = blockDim.y * blockIdx.y + threadIdx.y;
             if ((x >= N) || (y >= N) || (y > x)) return;
             int pos = y*N+x;
             int my_diag = x-y;
             diags[my_diag * N + x] = matrix[pos];
         }
         """
     )
     self.extract_diags_kernel = mod.get_function("extract_upper_diags")
     self._blocks = (32, 32, 1)
     self._grid = (
         updiv(self.nframes, self._blocks[0]),
         updiv(self.nframes, self._blocks[1]),
         1
     )
     self.d_diags = garray.zeros((self.nframes, self.nframes), dtype=np.float32)
     self.d_sumdiags1 = garray.zeros(self.nframes, dtype=np.float32)
     self.d_sumdiags2 = garray.zeros_like(self.d_sumdiags1)
     self._kern_args = [
         None,
         self.d_diags,
         np.int32(self.nframes),
     ]
Example #25
    def _correlate_fft(self, frames_flat, cufft_plan):
        npix = frames_flat.shape[1]

        d_in = cufft_plan.data_in
        d_in.fill(0)
        f_out1 = cufft_plan.data_out
        f_out2 = garray.zeros_like(cufft_plan.data_out)

        # fft(pad(frames_flat), axis=1)
        d_in[:, :self.nframes] = frames_flat.T.astype("f")
        f_out1 = cufft_plan.fft(d_in, output=f_out1)

        # frames_flat.sum(axis=1)
        # skmisc.sum() only works on base data, not gpuarray views,
        # so we sum on the whole array and then extract the right subset.
        skmisc.sum(d_in, axis=0, out=self.d_sums_denom_tmp)

        # fft(pad(frames_flat[::-1]), axis=1)
        d_in.fill(0)
        d_in[:, :self.nframes] = frames_flat.T[:, ::-1].astype("f")
        f_out2 = cufft_plan.fft(d_in, output=f_out2)

        # product, ifft
        f_out1 *= f_out2
        num = cufft_plan.ifft(f_out1, output=d_in)

        # numerator of g_2
        skmisc.sum(num, axis=0, out=self.d_sums)

        # denominator of g_2: correlate(d_sums_denom)
        self._correlate_denom(npix)

        self.d_numerator /= self.d_denom
        res = self.d_numerator.get()
        return res
Example #26
    def execute(self):
        resulting_image = None
        nda = None
        f_first = True

        img_cnt = 0

        for itr_img in self.images_iterator:
            img_cnt += 1

            if f_first:
                nda = np.ndarray(shape=itr_img.image.shape,
                                 dtype=itr_img.image.dtype)

                nda[:] = itr_img.image[:]

                self.resulting_image = itr_img
                resulting_image = gpuarray.to_gpu(nda)

                current_image = gpuarray.zeros_like(resulting_image)
                f_first = False
                shape = itr_img.shape
                continue

            if shape != itr_img.shape:
                img_cnt -= 1
                continue

            current_image.set(itr_img.image)

            resulting_image += current_image

        resulting_image /= img_cnt

        self.resulting_image.image[:] = resulting_image.get()
Example #27
File: layer.py Project: phecy/striate
 def fprop(self, input, output, train=TRAIN):
     self.denom = gpuarray.zeros_like(input)
     cudaconv2.convResponseNormCrossMap(input, self.denom, output,
                                        self.numColor, self.size,
                                        self.scaler, self.pow, self.blocked)
     if PFout:
         print_matrix(output, self.name)
Example #28
    def cuda_run(self, prefix, supportK):
        print('Running Eclat in recursive: number of itemsets found:',
              len(self.support_list),
              end='\r')

        while supportK:
            itemset, bitvector = supportK.pop(0)
            support = gpuarray.sum(bitvector).get()

            if support >= self.min_support:
                self.support_list[frozenset(sorted(prefix +
                                                   [itemset]))] = int(support)

                suffix = []
                for itemset_sub, bitvector_sub in supportK:
                    if gpuarray.sum(bitvector_sub).get() >= self.min_support:
                        if self.use_optimal:
                            union_bitvector = bitvector.__mul__(bitvector_sub)
                        else:
                            union_bitvector = gpuarray.zeros_like(bitvector)
                            self.multiply(union_bitvector,
                                          bitvector,
                                          bitvector_sub,
                                          block=self.block,
                                          grid=self.grid)

                        if gpuarray.sum(
                                union_bitvector).get() >= self.min_support:
                            suffix.append((itemset_sub, union_bitvector))

                self.cuda_run(
                    prefix + [itemset],
                    sorted(suffix, key=lambda x: int(x[0]), reverse=True))
Example #29
File: misc.py Project: Captricity/sciguppy
def softmax_back(d_a, d_error, s):
    d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_out.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_out.size / float(thread_size))), 1)
    softmax_back_kernel(d_a, d_error, d_out, numpy.float32(s), numpy.int32(d_out.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
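
Note: the thread_size/block_size arithmetic above (repeated in several examples below) is a flat 1-D launch configuration. With illustrative numbers not from the source, d_out.size = 2500 and MAX_BLOCK_SIZE = 1024 give thread_size = min(2500, 1024) = 1024 and block_size = ceil(2500 / 1024) = 3, i.e. 3 * 1024 = 3072 launched threads; the element count is passed as numpy.int32(d_out.size), presumably so the surplus threads can bounds-check and exit.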
Example #30
File: ccl.py Project: kotenev/SuRVoS2
def ccl3d(labels, remap=True):
    assert labels.ndim == 3
    assert labels.dtype == np.uint32

    with open(op.join(__dirname__, 'kernels', 'ccl3d.cu'), 'r') as f:
        _mod_conv = SourceModule(f.read())
        gpu_ccl_local = _mod_conv.get_function('uf_local')
        gpu_ccl_global = _mod_conv.get_function('uf_global')
        gpu_ccl_final = _mod_conv.get_function('uf_final')

    labels_gpu = asgpuarray(labels, dtype=np.uint32)
    result_gpu = gpuarray.zeros_like(labels_gpu)
    shape = np.asarray(tuple(labels.shape[::-1]), dtype=int3)

    block, grid = grid_kernel_config(gpu_ccl_local, labels.shape)
    shared = int(np.prod(block) * 8)

    gpu_ccl_local(labels_gpu,
                  result_gpu,
                  shape,
                  block=block,
                  grid=grid,
                  shared=shared)
    gpu_ccl_global(labels_gpu, result_gpu, shape, block=block, grid=grid)
    gpu_ccl_final(result_gpu, shape, block=block, grid=grid)

    if remap:
        return remap_labels(result_gpu.get())

    return result_gpu
Example #31
def custom_filter_gpu(image, template):
    if not (template.shape[0] == template.shape[1]):
        raise ValueError("The template must be square")
    if template.shape[0] % 2 == 0:
        raise ValueError("The template side length must be odd")
    filtersize05 = template.shape[0] // 2

    image_gpu = gpuarray.to_gpu(image)

    filtered_image = gpuarray.zeros_like(image_gpu)

    s = template.sum()
    window = gpuarray.to_gpu(
        np.array([coef / s for coef in template.flatten()]))
    shape = filtered_image.shape
    wid = 0

    for i in range(shape[0]):
        for j in range(shape[1]):
            for color in range(shape[2]):
                wid = 0
                for m in range(i - filtersize05, i + filtersize05 + 1):
                    for n in range(j - filtersize05, j + filtersize05 + 1):
                        if 0 <= m and m < shape[0] and 0 <= n and n < shape[1]:
                            filtered_image[i][j][
                                color] += window[wid] * image_gpu[m][n][color]
                        wid += 1

    return filtered_image.get()
Example #32
File: misc.py Project: Captricity/sciguppy
def ewsum(d_a, d_w):
    """
    YORI NOTES

    This method is faster than CPU if num_w is large, and non_width is small:
        When num_w is large, the for loop is small
        When non_width is large, there are more threads necessary
    """
    width = d_a.shape[0]
    total_dim = d_a.size
    num_w = d_w.shape[0]
    d_tmp_out = gpuarray.zeros_like(d_a)
    
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    ewsum_kernel(d_a, d_w, d_tmp_out,
            numpy.int32(num_w), numpy.int32(width), numpy.int32(total_dim),
            block=(thread_size,1,1), grid=(block_size,1,1))

    # TODO: There HAS to be a better way to do this
    x = width / num_w
    d_out = gpuarray.zeros((x,) + d_a.shape[1:], numpy.float32)
    thread_size = min(d_out.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_out.size / float(thread_size))), 1)
    ewsum_sum_kernel(d_tmp_out, d_out,
            numpy.int32(num_w), numpy.int32(width), numpy.int32(total_dim),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
Example #33
 def test_cublasDcopy(self):
     x = np.random.rand(5).astype(np.float64)
     x_gpu = gpuarray.to_gpu(x)
     y_gpu = gpuarray.zeros_like(x_gpu)
     cublas.cublasDcopy(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1,
                        y_gpu.gpudata, 1)
     assert np.allclose(y_gpu.get(), x_gpu.get())
Example #34
 def setUp(self):
     self.comm = MPI.COMM_WORLD
     self.gpu_comm = GPUComm(MPI.COMM_WORLD)
     self.cpu_send = np.random.rand(*TEST_DIMS).astype(np.float32)
     self.gpu_send = gpu.to_gpu(self.cpu_send)
     self.cpu_recv = np.zeros_like(self.cpu_send)
     self.gpu_recv = gpu.zeros_like(self.gpu_send)
Example #35
File: daq.py Project: NuTufts/ChromaUBooNE
    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros_like(
                self.earliest_time_int_gpu)
            self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
            self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                gpu_detector.nchannels * ndaq,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue,
                                          gpu_detector.nchannels * ndaq,
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector  # struct not made in opencl mode, so we keep a copy of the class
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels
Example #36
def same_reduce_multiview(target, vec, num_view):
    block = (target.size, 1, 1)
    grid = (1, 1)
    tmp = gpuarray.zeros_like(target)
    ids = gpuarray.zeros_like(target)
    _same_reduce_multiview_(target,
                            vec,
                            tmp,
                            ids,
                            I(num_view),
                            block=block,
                            grid=grid)
    tmp = tmp.reshape((1, tmp.size))
    res = gpuarray.to_gpu(np.zeros((1, 1)).astype(np.float32))
    add_row_sum_to_vec(res, tmp)

    return res.get()[0, 0]
Example #37
def exp(d_a, mode=MathModes.ACC):
    if mode == MathModes.ACC:
        return cumath.exp(d_a)

    d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    exp_fast_kernel(d_a, d_out, numpy.int32(d_a.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
Example #38
File: layer.py Project: phecy/striate
  def __init__(self, name, type, epsW, epsB, initW, initB, momW, momB, wc, weight, bias,
      weightIncr , biasIncr, weightShape, biasShape):
    Layer.__init__(self, name, type)

    self.epsW = F(epsW)
    self.epsB = F(epsB)
    self.initW = initW
    self.initB = initB
    self.momW = F(momW)
    self.momB = F(momB)
    self.wc = F(wc)

    if weight is None:
      self.weight = gpuarray.to_gpu(randn(weightShape, np.float32) * self.initW)
    else:
      print >> sys.stderr,  'init weight from disk'
      self.weight = gpuarray.to_gpu(weight)#.astype(np.float32)

    if bias is None:
      if self.initB > 0.0:
        self.bias = gpuarray.to_gpu((np.ones(biasShape, dtype=np.float32) * self.initB))
      else:
        self.bias = gpuarray.zeros(biasShape, dtype=np.float32)
    else:
      print >> sys.stderr,  'init bias from disk'
      self.bias = gpuarray.to_gpu(bias).astype(np.float32)

    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
    if self.momW > 0.0:
      if weightIncr is None:
        self.weightIncr = gpuarray.zeros_like(self.weight)
      else:
        print >> sys.stderr,  'init weightIncr from disk'
        #weightIncr = np.require(weightIncr, dtype = np.float, requirements = 'C')
        self.weightIncr = gpuarray.to_gpu(weightIncr)
    if self.momW > 0.0:
      if biasIncr is None:
        self.biasIncr = gpuarray.zeros_like(self.bias)
      else:
        print >> sys.stderr,  'init biasIncr from disk'
        #biasIncr = np.require(biasIncr, dtype = np.float, requirements = 'C')
        self.biasIncr = gpuarray.to_gpu(biasIncr)
Example #39
File: misc.py Project: Captricity/sciguppy
def rectify_back(d_a, d_error, inplace=False):
    if inplace:
        d_out = d_a
    else:
        d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_out.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_out.size / float(thread_size))), 1)
    rectify_back_kernel(d_a, d_error, d_out, numpy.int32(d_out.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
Example #40
File: batchtps.py Project: rll/lfd
    def add_cld(self, name, proj_mats, offset_mats, cloud_xyz, kernel, scale_params, update_ptrs=False):
        """
        adds a new cloud to our context for batch processing
        """
        self.check_cld(cloud_xyz)
        self.ptrs_valid = False
        self.N += 1
        self.seg_names.append(name)
        self.names2inds[name] = self.N - 1
        self.tps_params.append(self.default_tps_params.copy())
        self.trans_d.append(self.tps_params[-1][0, :])
        self.lin_dd.append(self.tps_params[-1][1 : DATA_DIM + 1, :])
        self.w_nd.append(self.tps_params[-1][DATA_DIM + 1 :, :])
        self.scale_params.append(scale_params)
        n = cloud_xyz.shape[0]

        for b in self.bend_coefs:
            proj_mat = proj_mats[b]
            offset_mat = offset_mats[b]
            self.proj_mats[b].append(gpu_pad(proj_mat, (MAX_CLD_SIZE + DATA_DIM + 1, MAX_CLD_SIZE)))

            if offset_mat.shape != (n + DATA_DIM + 1, DATA_DIM):
                raise ValueError("Offset Matrix has incorrect dimension")
            self.offset_mats[b].append(gpu_pad(offset_mat, (MAX_CLD_SIZE + DATA_DIM + 1, DATA_DIM)))

        if n > MAX_CLD_SIZE or cloud_xyz.shape[1] != DATA_DIM:
            raise ValueError("cloud_xyz has incorrect dimension")
        self.pts.append(gpu_pad(cloud_xyz, (MAX_CLD_SIZE, DATA_DIM)))
        if kernel.shape != (n, n):
            raise ValueError("dimension mismatch b/t kernel and cloud")
        self.kernels.append(gpu_pad(kernel, (MAX_CLD_SIZE, MAX_CLD_SIZE)))
        self.dims.append(n)

        self.pts_w.append(gpuarray.zeros_like(self.pts[-1]))
        self.pts_t.append(gpuarray.zeros_like(self.pts[-1]))
        self.corr_cm.append(gpuarray.zeros((MAX_CLD_SIZE, MAX_CLD_SIZE), np.float32))
        self.corr_rm.append(gpuarray.zeros((MAX_CLD_SIZE, MAX_CLD_SIZE), np.float32))
        self.r_coefs.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))
        self.c_coefs_rn.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))
        self.c_coefs_cn.append(gpuarray.zeros((MAX_CLD_SIZE, 1), np.float32))

        if update_ptrs:
            self.update_ptrs()
Example #41
    def map_elementwise_max(self, op, field_expr):
        field = self.rec(field_expr)
        field_out = gpuarray.zeros_like(field)

        func_rec = self.executor.get_elwise_max_kernel(field.dtype)

        func_rec.func.prepared_call((func_rec.grid_dim, 1), field.gpudata,
                                    field_out.gpudata, func_rec.mb_count)

        return field_out
Example #42
File: execute.py Project: felipeh/hedge
    def map_elementwise_max(self, op, field_expr):
        field = self.rec(field_expr)
        field_out = gpuarray.zeros_like(field)

        func_rec = self.executor.get_elwise_max_kernel(field.dtype)

        func_rec.func.prepared_call((func_rec.grid_dim, 1),
            field.gpudata, field_out.gpudata, func_rec.mb_count)

        return field_out
Example #43
def expit_back(d_a, d_error):
    """Implements the following function

    out = in * (1 - in) * error
    """
    d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    expit_back_kernel(d_a, d_error, d_out, numpy.int32(d_a.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
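
A minimal CPU cross-check of the formula stated in the docstring (numpy sketch, not part of the source):

import numpy as np

def expit_back_cpu(a, error):
    # out = in * (1 - in) * error  (gradient of the sigmoid/expit)
    return a * (1.0 - a) * error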
Example #44
def robust_pca(D):
    """ 
    Parrallel RPCA using ALM, adapted from https://github.com/nwbirnie/rpca.
    Takes and returns numpy arrays
    """
    M = gpuarray.to_gpu(D)
    L = gpuarray.zeros_like(M)
    S = gpuarray.zeros_like(M)    
    Y = gpuarray.zeros_like(M)
    print M.shape

    mu = (M.shape[0] * M.shape[1]) / (4.0 * L1Norm(M))
    lamb = max(M.shape) ** -0.5

    while not converged(M, L, S):
        L = svd_shrink(M - S - (mu**-1) * Y, mu)
        S = shrink(M - L + (mu**-1) * Y, lamb * mu)
        Y = Y + mu * (M - L - S)

    return L.get(), S.get()
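
The helpers svd_shrink, shrink, converged and L1Norm are not included in this snippet. In ALM-based RPCA, shrink is normally elementwise soft-thresholding; a minimal CPU sketch under that assumption (the GPU version would apply the same formula to the gpuarrays):

import numpy as np

def shrink_cpu(X, tau):
    # Soft-thresholding operator: sign(X) * max(|X| - tau, 0)
    return np.sign(X) * np.maximum(np.abs(X) - tau, 0.0)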
Example #45
    def test_2d_fp_surfaces(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints)[:]
            A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized

            myKernRW = '''
            #include <pycuda-helpers.hpp>

            surface<void, cudaSurfaceType2DLayered> mtx_tex;

            __global__ void copy_texture(cuPres *dest, int rw)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int layer = 1;
              int tid = row + col*blockDim.x*gridDim.x ;
              if (rw==0){
              cuPres aux = dest[tid];
              fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);}
              else {
              cuPres aux = 0;
              fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp);
              dest[tid] = aux;
              }
            }
            '''
            myKernRW = myKernRW.replace('fpName',fpName_str)
            myKernRW = myKernRW.replace('cuPres',prec_str)
            modW = SourceModule(myKernRW)

            copy_texture = modW.get_function("copy_texture")
            mtx_tex = modW.get_surfref("mtx_tex")
            cuBlock = (8,8,1)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,1)
            cuGrid   = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1)
            copy_texture.prepare('Pi')#,texrefs=[mtx_tex])
            A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros
            cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True)
            A_cpu = A_gpu.get() # To remember original array
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
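
Note: the cuGrid expression rounds the launch up to cover every point: with npoints = 32 and cuBlock = (8, 8, 1) as in this test, 32 // 8 + 1 * (32 % 8 != 0) = 4 + 0 = 4, so the 4 x 4 grid of 8 x 8 blocks spans exactly the 32 x 32 surface.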
Example #46
def gpu_fft(data, inverse=False):
    global plan, ctx, stream  ##cuda
    if not plan:
        print 'building plan', data.shape
        plan = Plan(data.shape, stream=stream, wait_for_finish=True)

    result = gpuarray.zeros_like(data)

    plan.execute(data, data_out=result, inverse=inverse)

    return result
Example #47
def expit(d_a, mode=MathModes.ACC):
    """Implements the expit function (aka sigmoid)

    expit(x) = 1 / (1 + exp(-x))
    """
    d_out = gpuarray.zeros_like(d_a)
    thread_size = min(d_a.size, MAX_BLOCK_SIZE)
    block_size = max(int(math.ceil(d_a.size / float(thread_size))), 1)
    kernel = expit_fast_kernel if mode == MathModes.FAST else expit_kernel
    kernel(d_a, d_out, numpy.int32(d_a.size),
            block=(thread_size,1,1), grid=(block_size,1,1))
    return d_out
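
As with expit_back above, the docstring formula has a direct CPU counterpart that can be used to check results (numpy sketch, not part of the source):

import numpy as np

def expit_cpu(x):
    # expit(x) = 1 / (1 + exp(-x))
    return 1.0 / (1.0 + np.exp(-x))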
Example #48
    def __init__(self, mesh, context=None):
        '''
        Args:
            mesh The mesh on which the solver will operate. The dimensionality
                 is deducted from mesh.dimension
        '''
        # create the mesh grid and compute the greens function on it
        self.mesh = mesh
        self._context = context
        mesh_shape = self.mesh.shape # nz, ny, (nx)
        mesh_shape2 = [2*n for n in mesh_shape] # 2*nz, 2*ny, (2*nx)
        mesh_distances = list(reversed(self.mesh.distances)) #dz, dy, dx
        self.fgreentr = gpuarray.empty(mesh_shape2,
                        dtype=np.complex128)
        self.tmpspace = gpuarray.zeros_like(self.fgreentr)
        sizeof_complex = np.dtype(np.complex128).itemsize

        # dimensionality function dispatch
        dim = self.mesh.dimension
        self._fgreen = getattr(self, '_fgreen' + str(dim) + 'd')
        self._mirror = getattr(self, '_mirror' + str(dim) + 'd')
        copy_fn = {'3d' : get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
        memcpy_nd = copy_fn[str(dim) + 'd']
        dim_args = self.mesh.shape
        self._cpyrho2tmp = memcpy_nd(
            src=None, dst=self.tmpspace, # None because src(rho) not yet known
            src_pitch=self.mesh.nx*sizeof_complex,
            dst_pitch=2*self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=self.mesh.ny,
            dst_height=2*self.mesh.ny)
        self._cpytmp2rho = memcpy_nd(
            src=self.tmpspace, dst=None, # None because dst(rho) not yet known
            src_pitch=2*self.mesh.nx*sizeof_complex,
            dst_pitch=self.mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=2*self.mesh.ny,
            dst_height=self.mesh.ny)
        mesh_arr = [-mesh_distances[i]/2 + np.arange(mesh_shape[i]+1)
                                            * mesh_distances[i]
                    for i in xrange(self.mesh.dimension)
                   ]
        # mesh_arr is [mz, my, mx]
        mesh_grids = np.meshgrid(*mesh_arr, indexing='ij')
        fgreen = self._fgreen(*mesh_grids)
        fgreen = self._mirror(fgreen)
        self.plan_forward = cu_fft.Plan(self.tmpspace.shape, in_dtype=np.complex128,
                                        out_dtype=np.complex128)
        self.plan_backward = cu_fft.Plan(self.tmpspace.shape, in_dtype=np.complex128,
                                         out_dtype=np.complex128)
        cu_fft.fft(gpuarray.to_gpu(fgreen), self.fgreentr, plan=self.plan_forward)
Example #49
File: layer.py Project: altus88/striate
  def __init__(self, name, type, epsW, epsB, initW, initB, weight, bias, weightShape, biasShape):
    Layer.__init__(self, name, type)

    self.epsW = epsW
    self.epsB = epsB
    self.initW = initW
    self.initB = initB

    if weight is None:
      self.weight = gpuarray.to_gpu(np.random.randn(*weightShape) *
          self.initW).astype(np.float32)
    else:
      self.weight = gpuarray.to_gpu(weight).astype(np.float32)

    if bias is None:
      self.bias = gpuarray.to_gpu(np.random.randn(*biasShape) *
          self.initB).astype(np.float32)
    else:
      self.bias = gpuarray.to_gpu(bias).astype(np.float32)
    self.weightGrad = gpuarray.zeros_like(self.weight)
    self.biasGrad = gpuarray.zeros_like(self.bias)
Example #50
def same_reduce(target, vec):
  '''
  Return the number of same values in the same offset of two vecs
  '''
  block = (target.size, 1, 1)
  grid = (1, 1)
  tmp = gpuarray.zeros_like(target)
  _same_reduce_(target, vec, tmp, block=block, grid=grid)
  tmp.shape = (1, tmp.size)
  res = gpuarray.to_gpu(np.zeros((1, 1)).astype(np.float32))
  add_row_sum_to_vec(res, tmp)

  return int(res.get()[0, 0])
Example #51
 def test_cublasZgeam(self):
     a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)
     b = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)
     a_gpu = gpuarray.to_gpu(a.copy())
     b_gpu = gpuarray.to_gpu(b.copy())
     c_gpu = gpuarray.zeros_like(a_gpu)
     alpha = np.complex128(np.random.rand()+1j*np.random.rand())
     beta = np.complex128(np.random.rand()+1j*np.random.rand())
     cublas.cublasZgeam(self.cublas_handle, 'n', 'n', 2, 3,
                        alpha, a_gpu.gpudata, 2,
                        beta, b_gpu.gpudata, 2,
                        c_gpu.gpudata, 2)
     assert np.allclose(c_gpu.get(), alpha*a+beta*b)
Example #52
    def _transform_wf(self, ps, qs):
        result_real_gpu = gpuarray.zeros(N.broadcast(ps, qs).shape, N.double)
        result_imag_gpu = gpuarray.zeros_like(result_real_gpu)

        self._kernel.prepared_call(self._gpu_grid, self._gpu_block,
                gpuarray.to_gpu(N.ascontiguousarray(ps)).gpudata,
                gpuarray.to_gpu(N.ascontiguousarray(qs)).gpudata,
                self._wf_q_grid_gpu.gpudata,
                self._wf_gpu.gpudata,
                result_real_gpu.gpudata,
                result_imag_gpu.gpudata,
                )

        return result_real_gpu.get() + 1j * result_imag_gpu.get()
Example #53
    def run(self):
        drv.init()
        a0=numpy.zeros((p,),dtype=numpy.complex64)
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
#TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING)
        print drv.mem_get_info() 
        self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
        self.gpu_b = garray.zeros_like(self.gpu_a)
        self.gpu_a = garray.to_gpu(self.input_cpu)
        plan = Plan(a0.shape,context=self.ctx)
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
        self.temp = self.gpu_b.get()
        print output_cpu._closed
        self.output_cpu.put(self.temp)
Example #54
    def _transform_wf(self, t):
        result_real_gpu = gpuarray.zeros((len(self._p_grid), len(self._q_grid)), np.double)
        result_imag_gpu = gpuarray.zeros_like(result_real_gpu)

        self._kernel.prepared_call(self._gpu_grid, self._gpu_block,
                                   self._p_grid_gpu.gpudata,
                                   self._q_grid_gpu.gpudata,
                                   self._wf_q_grid_gpu.gpudata,
                                   self._wfs_gpu.gpudata,
                                   self._energies_gpu.gpudata,
                                   t,
                                   result_real_gpu.gpudata,
                                   result_imag_gpu.gpudata,
                                   )

        return result_real_gpu.get() + 1j * result_imag_gpu.get()
Example #55
    def backprop(self, input_data, targets, cache=None):
        df_input = gpuarray.zeros_like(input_data)

        if cache is None: cache = self.n_tasks * [None]

        gradients = []
        for targets_task, cache_task, task, task_weight  in \
          izip(targets, cache, self.tasks, self.task_weights):
            gradients_task, df_input_task = \
              task.backprop(input_data, targets_task,
                            cache_task)

            df_input = df_input.mul_add(1., df_input_task, task_weight)

            gradients.extend(gradients_task)

        return gradients, df_input
Example #56
def wsparsify(w_gpu, percentage):
  """
  Keeps only as many entries nonzero as specified by percentage.
  """

  w    = w_gpu.get()
  vals = sort(w, axis=None)[::-1]
  idx  = int(floor(prod(w.shape) * percentage/100))
  zw_gpu = cua.zeros_like(w_gpu)   # gpu array filled with zeros
  tw_gpu = cua.empty_like(w_gpu)   # gpu array containing threshold
  tw_gpu.fill(vals[idx])
  w_gpu  = cua.if_positive(w_gpu > tw_gpu, w_gpu, zw_gpu)

  del zw_gpu
  del tw_gpu

  return w_gpu