Example #1
File: dm10.py Project: zemlni/quantumsim
    def add_ancilla(self, anc_st):
        """Add an ancilla in the ground or excited state as the highest new bit.
        """

        byte_size_of_smaller_dm = 2**(2 * self.no_qubits) * 8

        if self.allocated_qubits == self.no_qubits:
            # allocate larger memory
            new_dm = ga.zeros(self._size * 4, np.float64)
            offset = anc_st * 3 * byte_size_of_smaller_dm
            drv.memcpy_dtod(int(new_dm.gpudata) + offset,
                                  self.data.gpudata, byte_size_of_smaller_dm)

            self.data = new_dm
        else:
            # reuse previously allocated memory
            if anc_st == 0:
                drv.memset_d8(int(self.data.gpudata) + byte_size_of_smaller_dm,
                              0, 3 * byte_size_of_smaller_dm)
            if anc_st == 1:
                drv.memcpy_dtod(int(self.data.gpudata) + 3 * byte_size_of_smaller_dm,
                                      self.data.gpudata, byte_size_of_smaller_dm)
                drv.memset_d8(self.data.gpudata, 0, 3 *
                              byte_size_of_smaller_dm)

        self._set_no_qubits(self.no_qubits + 1)
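
A minimal, self-contained sketch (not part of quantumsim; it assumes only a working CUDA device, pycuda, and numpy) of the pointer-arithmetic pattern used above: memcpy_dtod accepts plain Python integers, so a byte offset can be added to int(gpudata) to target a sub-region of a larger allocation.

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as drv
import pycuda.gpuarray as ga

small = ga.to_gpu(np.arange(4, dtype=np.float64))  # 4 doubles = 32 bytes
big = ga.zeros(16, np.float64)                     # room for four such blocks

block_bytes = small.nbytes
slot = 3                                           # copy into the last quarter
drv.memcpy_dtod(int(big.gpudata) + slot * block_bytes,
                small.gpudata, block_bytes)

assert np.allclose(big.get()[12:16], np.arange(4))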
Example #2
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10),
                           concentrationIn_d,
                           activeBlocks_d,
                           grid=grid2D,
                           block=block2D)
        getActivityKernel(activeBlocks_d,
                          activeThreads_d,
                          grid=grid2D,
                          block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
Example #3
File: LPU.py Project: yiyin/neurokernel
    def _read_external_input(self):
        # if eof not reached or there are frames in buffer not read
        # copy the input from buffer to synapse state array
        if not self.input_eof or self.frame_count < self.frames_in_buffer:
            cuda.memcpy_dtod(
                int(int(self.synapse_state.gpudata) + self.total_synapses * self.synapse_state.dtype.itemsize),
                int(int(self.I_ext.gpudata) + self.frame_count * self.I_ext.ld * self.I_ext.dtype.itemsize),
                self.num_input * self.synapse_state.dtype.itemsize,
            )
            self.frame_count += 1
        else:
            self.log_info("Input end of file reached. " "Subsequent behaviour is undefined.")
        # if all buffer frames were read, read from file
        if self.frame_count >= self._one_time_import and not self.input_eof:
            input_ld = self.input_h5file.root.array.shape[0]
            if input_ld - self.file_pointer < self._one_time_import:
                h_ext = self.input_h5file.root.array.read(self.file_pointer, input_ld)
            else:
                h_ext = self.input_h5file.root.array.read(self.file_pointer, self.file_pointer + self._one_time_import)
            if h_ext.shape[0] == self.I_ext.shape[0]:
                self.I_ext.set(h_ext)
                self.file_pointer += self._one_time_import
                self.frame_count = 0
            else:
                pad_shape = list(h_ext.shape)
                self.frames_in_buffer = h_ext.shape[0]
                pad_shape[0] = self._one_time_import - h_ext.shape[0]
                h_ext = np.concatenate((h_ext, np.zeros(pad_shape)), axis=0)
                self.I_ext.set(h_ext)
                self.file_pointer = input_ld

            if self.file_pointer == self.input_h5file.root.array.shape[0]:
                self.input_eof = True
Example #4
 def cache_z(self, z):
     x = np.require(z.real, dtype = np.double, requirements = ['A','W','O','C'])
     y = np.require(z.imag, dtype = np.double, requirements = ['A','W','O','C'])
     xd = gpuarray.to_gpu(x)
     yd = gpuarray.to_gpu(y)
     cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes)
     cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
Example #5
    def copy_rows(self, start, stop, step=1):
        nrows = len(range(start, stop, step))
        if nrows:

            if self.ndim == 2:
                shape = (nrows, self.shape[1])
            else:
                shape = (nrows, self.shape[1], self.shape[2])
        else:
            if self.ndim == 2:
                shape = (nrows, 0)
            else:
                shape = (nrows, 0, 0)

        result = PitchArray(shape, self.dtype)

        if nrows > 1:
            PitchTrans(
                shape, result.gpudata, result.ld,
                int(self.gpudata) + start * self.ld * self.dtype.itemsize,
                self.ld * step, self.dtype)
        elif nrows == 1:
            cuda.memcpy_dtod(
                result.gpudata,
                int(self.gpudata) + start * self.ld * self.dtype.itemsize,
                self.dtype.itemsize * _pd(shape))
        return result
Example #6
 def swapHashTableValues(new_vals):
     table_vals, table_vals_size = mod.get_global('table_values') # (device_ptr, size_in_bytes)
     old_vals_gpu = cuda.mem_alloc(table_vals_size)
     # old_vals_gpu = gpuarray.empty((table_vals_size,1), )
     cuda.memcpy_dtod(old_vals_gpu, table_vals, table_vals_size)
     cuda.memcpy_dtod(table_vals, new_vals.gpudata, table_vals_size)
     return old_vals_gpu
Example #7
    def copy(self):
        if not self.flags.forc:
            raise RuntimeError("only contiguous arrays may copied.")

        new = GPUArray(self.shape, self.dtype)
        drv.memcpy_dtod(new.gpudata, self.gpudata, self.nbytes)
        return new
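
For reference, a hedged usage sketch of the same copy-by-memcpy_dtod idea outside of any class (assuming only pycuda, numpy, and an initialized context):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

src = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32))
dst = gpuarray.empty_like(src)
# device-to-device copy of the whole contiguous buffer; no host round trip
drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
assert np.allclose(dst.get(), src.get())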
Example #8
 def execute(self):
     ready = self.comm.recv(source=self.op.source_id, tag=TAG_SCATTER)
     if ready:
         drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                         self.tensor.tensor.size * self.op.dtype.itemsize)
     else:
         raise RuntimeError("Synchronization failed!")
Example #9
File: cuda.py Project: skallumadi/chainer
def _gpuarray_copy(array):
    if not array.flags.forc:
        raise RuntimeError('only contiguous arrays may be copied.')

    new = GPUArray(array.shape, array.dtype, allocator=array.allocator)
    drv.memcpy_dtod(new.gpudata, array.gpudata, array.nbytes)
    return new
Example #10
 def execute(self):
     # Push our fragment into its section of the larger recvr buffer, which assumes gather axis
     # is least contiguous.
     if self.comm.Get_rank() > 0:
         drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                         self.tensor.tensor.size * self.op.dtype.itemsize)
         self.comm.barrier()
Example #11
 def execute(self):
     sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
     while (sender_ready == 0):
         sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
     drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                     self.tensor.tensor.size * self.op.dtype.itemsize)
     drv.memset_d8(self.sender_ready, 0, 1)
Example #12
def leapfrogStationary(d_x, d_t, v, xmin, xmax, alpha):

  # --- Allocate device memory space for solution
  d_u  = cuda.mem_alloc((N + 1) * (M + 1) * 4)
  d_u1 = cuda.mem_alloc((N + 1)           * 4)
  d_u2 = cuda.mem_alloc((N + 1)           * 4)
  d_u3 = cuda.mem_alloc((N + 1)           * 4)
  # --- Set memory to zero
  cuda.memset_d32(d_u , 0x00, (N + 1) * (M + 1))
  cuda.memset_d32(d_u1, 0x00, (N + 1)          )
  cuda.memset_d32(d_u2, 0x00, (N + 1)          )
  cuda.memset_d32(d_u3, 0x00, (N + 1)          )
  # u     = np.zeros(((M + 1), N + 1))

  blockDim  = (BLOCKSIZE, 1, 1)
  gridDim   = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)

  # --- Step0
  setStep0(d_u1, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.int32(N), block = blockDim, grid = gridDim)

  # --- Step1
  setStep1(d_u1, d_u2, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.float32(dt), np.int32(N), block = blockDim, grid = gridDim)

  for l in range(1, M - 1):

    updateShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)
    # updateNoShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)
    # updateNoSharedNotWorking(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)

    cuda.memcpy_dtod(d_u1, d_u2, (N + 1) * 4)
    cuda.memcpy_dtod(d_u2, d_u3, (N + 1) * 4)

  return d_u
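
The time-level rotation at the end of the loop above is just a raw byte copy between equally sized buffers. A minimal sketch of that building block on its own (assumed: pycuda, numpy, a CUDA device):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

n = 8
nbytes = n * 4                          # float32 elements
d_prev = cuda.mem_alloc(nbytes)
d_curr = cuda.mem_alloc(nbytes)
cuda.memset_d32(d_prev, 0, n)           # count is in 32-bit words, not bytes
cuda.memcpy_htod(d_curr, np.arange(n, dtype=np.float32))

# advance the time level: u_prev <- u_curr, as in the leapfrog loop
cuda.memcpy_dtod(d_prev, d_curr, nbytes)

out = np.empty(n, dtype=np.float32)
cuda.memcpy_dtoh(out, d_prev)
assert np.allclose(out, np.arange(n))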
Example #13
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a c or f contiguous array, then use the speedy driver kernel
            if self.flags.forc and float(value) >= 0:
                drv.memset_d16(self.gpudata,
                               Flexpt.flex_from_native(value, self.iwl),
                               self.size)
            # otherwise use our copy kernel
            else:
                OpTreeNode.build("copy", value, None, out=self)

        elif isinstance(value, FlexptArray):
            if self.flags.forc and value.flags.forc and self.iwl == value.iwl:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("copy", value, None, out=self)

        elif isinstance(value, OpTreeNode):
            value.execute(out=self)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
Example #14
 def run_step(self, update_pointers, st=None):
     self.sum_in_variable('individual_input',
                          self.inputs['individual_input'],
                          st=st)
     cuda.memcpy_dtod(int(update_pointers['I']),
                      self.inputs['individual_input'].gpudata,
                      self.inputs['individual_input'].nbytes)
Example #15
    def _copy_memory_dtod(self, index):
        # downsample index
        d_index = int(index / self.rate)

        # buffer index
        b_index = d_index % self.buffer_length

        for key in self.spike_vars:
            src = getattr(self.obj, key)
            self.gpu_dct[key][b_index] += src

        if index % self.rate != 0:
            return

        for key in self.dct.keys():
            src = getattr(self.obj, key)
            if key in self.spike_vars:
                continue
            else:
                dst = int(self.gpu_dct[key].gpudata) + b_index * src.nbytes
                cuda.memcpy_dtod(dst, src.gpudata, src.nbytes)

        # dump data to CPU if simulation complete or buffer full
        if (d_index == self.steps - 1) or (b_index == self.buffer_length - 1):
            for key in self.dct.keys():
                buffer = self.get_buffer(key, d_index)
                cuda.memcpy_dtoh(buffer, self.gpu_dct[key].gpudata)
            for key in self.spike_vars:
                self.gpu_dct[key].fill(0.0)
Example #16
 def _read_external_input(self):
     if not self.input_eof or self.frame_count<self.frames_in_buffer:
         cuda.memcpy_dtod(int(int(self.synapse_state.gpudata) + \
                          self.total_synapses*self.synapse_state.dtype.itemsize), \
                          int(int(self.I_ext.gpudata) + self.frame_count*self.I_ext.ld*self.I_ext.dtype.itemsize), \
                          self.num_input * self.synapse_state.dtype.itemsize)
         self.frame_count += 1
     else:
         self.logger.info('Input end of file reached. Subsequent behaviour is undefined.')
     if self.frame_count >= self._one_time_import and not self.input_eof:
         input_ld = self.input_h5file.root.array.shape[0]
         if input_ld - self.file_pointer < self._one_time_import:
             h_ext = self.input_h5file.root.array.read(self.file_pointer, input_ld)
         else:
             h_ext = self.input_h5file.root.array.read(self.file_pointer, self.file_pointer + self._one_time_import)
         if h_ext.shape[0] == self.I_ext.shape[0]:
             self.I_ext.set(h_ext)
             self.file_pointer += self._one_time_import
             self.frame_count = 0
         else:
             pad_shape = list(h_ext.shape)
             self.frames_in_buffer = h_ext.shape[0]
             pad_shape[0] = self._one_time_import - h_ext.shape[0]
             h_ext = np.concatenate((h_ext, np.zeros(pad_shape)), axis=0)
             self.I_ext.set(h_ext)
             self.file_pointer = input_ld
             
         if self.file_pointer == self.input_h5file.root.array.shape[0]:
             self.input_eof = True
Example #17
    def copy(self):
        if not self.flags.forc:
            raise RuntimeError("only contiguous arrays may copied.")

        new = GPUArray(self.shape, self.dtype)
        drv.memcpy_dtod(new.gpudata,self.gpudata,self.nbytes)
        return new
Example #18
    def set(self, queue_adapter, buf, no_async=False):
        device_idx = queue_adapter._device_idx
        assert device_idx == self._device_idx
        self._context_adapter.activate_device(device_idx)

        # PyCUDA needs pointers to be passed as `numpy.number` to kernels,
        # but `memcpy` functions require Python `int`s.
        ptr = int(self._ptr) if isinstance(self._ptr,
                                           numpy.number) else self._ptr

        if isinstance(buf, numpy.ndarray):
            if no_async:
                pycuda_driver.memcpy_htod(ptr, buf)
            else:
                pycuda_driver.memcpy_htod_async(
                    ptr, buf, stream=queue_adapter._pycuda_stream)
        else:
            buf_ptr = int(buf._ptr) if isinstance(buf._ptr,
                                                  numpy.number) else buf._ptr
            if no_async:
                pycuda_driver.memcpy_dtod(ptr, buf_ptr, buf.size)
            else:
                pycuda_driver.memcpy_dtod_async(
                    ptr,
                    buf_ptr,
                    buf.size,
                    stream=queue_adapter._pycuda_stream)
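
A minimal sketch of the stream-ordered variant used in the else-branch above, memcpy_dtod_async (assumptions: pycuda, numpy, a CUDA device; the stream name is illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()
a = gpuarray.to_gpu(np.ones(256, dtype=np.float32))
b = gpuarray.empty_like(a)

# enqueue the device-to-device copy on `stream` and return immediately
drv.memcpy_dtod_async(b.gpudata, a.gpudata, a.nbytes, stream=stream)
stream.synchronize()                    # wait before reading the result back
assert np.allclose(b.get(), 1.0)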
Example #19
File: cuda.py Project: ALEXGUOQ/chainer
def _gpuarray_copy(array):
    if not array.flags.forc:
        raise RuntimeError('only contiguous arrays may be copied.')

    new = GPUArray(array.shape, array.dtype, allocator=array.allocator)
    drv.memcpy_dtod(new.gpudata, array.gpudata, array.nbytes)
    return new
Example #20
File: LPU.py Project: ctshih/neurokernel
    def _read_external_input(self):
        if not self.input_eof or self.frame_count < self.frames_in_buffer:
            cuda.memcpy_dtod(int(int(self.synapse_state.gpudata) + \
                             self.total_synapses*self.synapse_state.dtype.itemsize), \
                             int(int(self.I_ext.gpudata) + self.frame_count*self.I_ext.ld*self.I_ext.dtype.itemsize), \
                             self.num_input * self.synapse_state.dtype.itemsize)
            self.frame_count += 1
        else:
            self.logger.info(
                'Input end of file reached. Subsequent behaviour is undefined.'
            )
        if self.frame_count >= self._one_time_import and not self.input_eof:
            input_ld = self.input_h5file.root.array.shape[0]
            if input_ld - self.file_pointer < self._one_time_import:
                h_ext = self.input_h5file.root.array.read(
                    self.file_pointer, input_ld)
            else:
                h_ext = self.input_h5file.root.array.read(
                    self.file_pointer,
                    self.file_pointer + self._one_time_import)
            if h_ext.shape[0] == self.I_ext.shape[0]:
                self.I_ext.set(h_ext)
                self.file_pointer += self._one_time_import
                self.frame_count = 0
            else:
                pad_shape = list(h_ext.shape)
                self.frames_in_buffer = h_ext.shape[0]
                pad_shape[0] = self._one_time_import - h_ext.shape[0]
                h_ext = np.concatenate((h_ext, np.zeros(pad_shape)), axis=0)
                self.I_ext.set(h_ext)
                self.file_pointer = input_ld

            if self.file_pointer == self.input_h5file.root.array.shape[0]:
                self.input_eof = True
Example #21
 def matvec(self, v):
     x = v.reshape((self.D, self.D))
     
     self.xG.set(x)
     #self.out2.set(self.xG)
     #self.out2[:] = self.xG
     cd.memcpy_dtod(self.out2.gpudata, self.xG.gpudata, self.xG.nbytes)
     
     out = [self.out, self.out_p]
     out2 = [self.out2, self.out2_p]
     
     if self.left: #Multiplying from the left, but x is a col. vector, so use mat_dagger
         for k in range(len(self.A1G)):
             if self.use_batch:
                 eps_l_noop_batch(out2[1], self.A1G_p[k], self.A2G_p[k], out[0], 
                                  self.tmp_p, self.tmp2_p, self.tmp2, self.hdl)
             else:
                 eps_l_noop_strm_dev(out2[0], self.A1G[k], self.A2G[k], out[0],
                                     self.tmp, self.tmp2, self.ones, self.zeros,
                                     self.streams, self.hdl)
             out, out2 = out2, out
         Ehx = out2[0]
         
         if self.pseudo:
             QEQhx = Ehx - self.lG * m.adot(self.r, x)
             #res = QEQhx.mul_add(-sp.exp(-1.j * self.p), self.xG, 1)
             cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(-1.j * self.p), 
                            QEQhx.gpudata, 1, self.xG.gpudata, 1)
             res = self.xG
         else:
             #res = Ehx.mul_add(-sp.exp(-1.j * self.p), self.xG, 1)
             cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(-1.j * self.p), 
                            Ehx.gpudata, 1, self.xG.gpudata, 1)
             res = self.xG
     else:
         for k in range(len(self.A2G) - 1, -1, -1):
             if self.use_batch:
                 eps_r_noop_batch(out2[1], self.A1G_p[k], self.A2G_p[k], out[0], 
                                  self.tmp_p, self.tmp2_p, self.tmp2, self.hdl)
             else:
                 eps_r_noop_strm_dev(out2[0], self.A1G[k], self.A2G[k], out[0],
                                     self.tmp, self.tmp2, self.ones, self.zeros,
                                     self.streams, self.hdl)
             out, out2 = out2, out
         Ex = out2[0]
         
         if self.pseudo:
             QEQx = Ex - self.rG * m.adot(self.l, x)
             #res = QEQx.mul_add(-sp.exp(1.j * self.p), self.xG, 1)
             cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(1.j * self.p), 
                            QEQx.gpudata, 1, self.xG.gpudata, 1)
             res = self.xG
         else:
             #res = Ex.mul_add(-sp.exp(1.j * self.p), self.xG, 1)
             cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(1.j * self.p), 
                            Ex.gpudata, 1, self.xG.gpudata, 1)
             res = self.xG
     
     return res.get().ravel()
Example #22
def set_data(filenames, file_count,subb, config, count, cur, img_mean, gpu_data, gpu_data_remote, ctx, icomm,img_batch_empty):

    load_time = time.time()
    data=None
    
#    aa = config['rank']+count/subb*size
#    img_list = range(aa*config['file_batch_size'],(aa+1)*config['file_batch_size'],1) 
    #print rank, img_list
    if config['data_source'] in ['hkl','both']:
        data_hkl = hkl.load(str(filenames[file_count]))# c01b
        data = data_hkl
        
    if config['data_source'] in ['lmdb', 'both']:       
        data_lmdb = lmdb_load_cur(cur,config,img_batch_empty) 
        data = data_lmdb
                        
    if config['data_source']=='both': 
        if config['rank']==0: print (rank,(data_hkl-data_lmdb)[1,0:3,1,1].tolist())
        
    load_time = time.time()-load_time #)*

    sub_time = time.time() #(
    data = data -img_mean
    sub_time = time.time()-sub_time

    crop_time = time.time() #(

    for minibatch_index in range(subb):
        count+=1
        
        batch_data = data[:,:,:,minibatch_index*config['batch_size']:(minibatch_index+1)*config['batch_size']]
        if mode == 'train':
            rand_arr = get_rand3d(config['random'], count+(rank+1)*n_files*(subb))
        else:
            rand_arr = np.float32([0.5, 0.5, 0]) 
        batch_data = crop_and_mirror(batch_data, rand_arr, flag_batch=config['batch_crop_mirror'],cropsize=config['input_width'])
        gpu_data[minibatch_index].set(batch_data)   

    crop_time = time.time() - crop_time #)
	
    #print 'load_time:  %f (load %f, sub %f, crop %f)' % (load_time+crop_time+sub_time, load_time,sub_time, crop_time)
    
    # wait for computation on last file to finish
    msg = icomm.recv(source=MPI.ANY_SOURCE,tag=35)
    assert msg == "calc_finished"
    
    for minibatch_index in range(subb):
        # copy from preload area
        drv.memcpy_dtod(gpu_data_remote[minibatch_index].ptr,
                        gpu_data[minibatch_index].ptr,
                        gpu_data[minibatch_index].dtype.itemsize *
                        gpu_data[minibatch_index].size
                        )

    ctx.synchronize()

    icomm.isend("copy_finished",dest=0,tag=55)
    
    return count
Example #23
 def execute(self):
     """
     Receive tensor
     """
     ready = self.comm.recv(source=self.source, tag=TAG_DIRECT)
     if ready:
         drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                         self.tensor.tensor.size * self.buf_item_size)
Example #24
 def pre_run(self, update_pointers):
     if self.params_dict.has_key('initV'):
         cuda.memcpy_dtod(int(update_pointers['V']),
                          self.params_dict['initV'].gpudata,
                          self.params_dict['initV'].nbytes)
         cuda.memcpy_dtod(self.internal_states['internalV'].gpudata,
                          self.params_dict['initV'].gpudata,
                          self.params_dict['initV'].nbytes)
Example #25
 def execute(self):
     # gather send execution is done here
     send_op_tensor = self.tensor_view_from_td(
         self.send_op[0].args[0].tensor_description())
     drv.memcpy_dtod(
         self.tensor.tensor.gpudata, send_op_tensor.tensor.gpudata,
         send_op_tensor.tensor.size * self.send_op[0].dtype.itemsize)
     self.comm.barrier()
Example #26
File: parray.py Project: bionet/vtem
 def copy(self):
     """
     returns a duplicated copy of self
     """
     result = self._new_like_me()
     if self.size:
         cuda.memcpy_dtod(result.gpudata, self.gpudata, self.mem_size * self.dtype.itemsize)
     
     return result
Example #27
File: array.py Project: ooreilly/vecpy
 def copy(self):
     tmp = np.ndarray((1, ))
     out = Array(tmp, None)
     out.nbytes = self.nbytes
     out.x = cuda.mem_alloc(self.nbytes)
     cuda.memcpy_dtod(out.x, self.x, out.nbytes)
     out.dtype = self.dtype
     out.shape = self.shape
     return out
Example #28
    def _loadInput(self, stim):
        logging.debug('loadInput')

        # shortcuts
        nrXY = self.nrX * self.nrY
        nrXYD = self.nrX * self.nrY * self.nrDirs

        # parse input
        assert type(stim).__module__ == "numpy", "stim must be numpy array"
        assert type(stim).__name__ == "ndarray", "stim must be numpy.ndarray"
        assert stim.size > 0, "stim cannot be []"
        stim = stim.astype(np.ubyte)

        rows, cols = stim.shape
        logging.debug("- stim shape={0}x{1}".format(rows, cols))

        # shift d_stimBuf in time by 1 frame, from frame i to frame i-1
        # write our own memcpy kernel... :-(
        gdim = (int(iDivUp(nrXY, 128)), 1)
        bdim = (128, 1, 1)
        for i in xrange(1, self.nrT):
            stimBufPt_dst = np.intp(self.d_stimBuf) + self.szXY * (i - 1)
            stimBufPt_src = np.intp(self.d_stimBuf) + self.szXY * i
            self.dev_memcpy_dtod(stimBufPt_dst,
                                 stimBufPt_src,
                                 np.int32(nrXY),
                                 block=bdim,
                                 grid=gdim)

        # index into d_stimBuf array to place the new stim at the end
        # (newest frame at pos: nrT-1)
        d_stimBufPt = np.intp(self.d_stimBuf) + self.szXY * (self.nrT - 1)

        # \TODO implement RGB support
        self.dev_split_gray(d_stimBufPt,
                            cuda.In(stim),
                            np.int32(stim.size),
                            block=bdim,
                            grid=gdim)

        # create working copy of d_stimBuf
        cuda.memcpy_dtod(self.d_scalingStimBuf, self.d_stimBuf,
                         self.szXY * self.nrT)

        # reset V1complex responses to 0
        # \FIXME not sure how to use memset...doesn't seem to give expected
        # result
        tmp = np.zeros(nrXYD).astype(np.float32)
        cuda.memcpy_htod(self.d_respV1c, tmp)

        # allocate d_resp, which will contain the response to all 28
        # (nrFilters) space-time orientations at 3 (nrScales) scales for
        # every pixel location (nrX*nrY)
        tmp = np.zeros(nrXY * self.nrFilters * self.nrScales).astype(
            np.float32)
        cuda.memcpy_htod(self.d_resp, tmp)
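
A hedged aside on the \FIXME above: zeroing a float32 device buffer with the driver memset does work, because 0.0f is the all-zero bit pattern; the count argument is in 32-bit words. Minimal sketch (assumes pycuda, numpy, a CUDA device; the buffer here merely stands in for d_respV1c):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

n = 1024
d_buf = cuda.mem_alloc(n * 4)           # n float32 values
cuda.memset_d32(d_buf, 0, n)            # value 0, count in 32-bit words
host = np.empty(n, dtype=np.float32)
cuda.memcpy_dtoh(host, d_buf)
assert not host.any()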
Example #29
 def add_initializer(self, var_a, var_b, update_pointers):
     if var_a in self.params_dict:
         if var_b in self.internal_states:
             cuda.memcpy_dtod(self.internal_states[var_b].gpudata,
                              self.params_dict[var_a].gpudata,
                              self.params_dict[var_a].nbytes)
         if var_b in update_pointers:
             cuda.memcpy_dtod(int(update_pointers[var_b]),
                              self.params_dict[var_a].gpudata,
                              self.params_dict[var_a].nbytes)
Example #30
    def _loadInput(self, stim):
        logging.debug('loadInput')

        # shortcuts
        nrXY = self.nrX * self.nrY
        nrXYD = self.nrX * self.nrY * self.nrDirs

        # parse input
        assert type(stim).__module__ == "numpy", "stim must be numpy array"
        assert type(stim).__name__ == "ndarray", "stim must be numpy.ndarray"
        assert stim.size > 0, "stim cannot be []"
        stim = stim.astype(np.ubyte)

        rows, cols = stim.shape
        logging.debug("- stim shape={0}x{1}".format(rows, cols))

        # shift d_stimBuf in time by 1 frame, from frame i to frame i-1
        # write our own memcpy kernel... :-(
        gdim = (int(iDivUp(nrXY, 128)), 1)
        bdim = (128, 1, 1)
        for i in xrange(1, self.nrT):
            stimBufPt_dst = np.intp(self.d_stimBuf) + self.szXY * (i - 1)
            stimBufPt_src = np.intp(self.d_stimBuf) + self.szXY * i
            self.dev_memcpy_dtod(
                stimBufPt_dst,
                stimBufPt_src,
                np.int32(nrXY),
                block=bdim, grid=gdim)

        # index into d_stimBuf array to place the new stim at the end
        # (newest frame at pos: nrT-1)
        d_stimBufPt = np.intp(self.d_stimBuf) + self.szXY * (self.nrT-1)

        # \TODO implement RGB support
        self.dev_split_gray(
            d_stimBufPt,
            cuda.In(stim),
            np.int32(stim.size),
            block=bdim, grid=gdim)

        # create working copy of d_stimBuf
        cuda.memcpy_dtod(self.d_scalingStimBuf, self.d_stimBuf,
                         self.szXY*self.nrT)

        # reset V1complex responses to 0
        # \FIXME not sure how to use memset...doesn't seem to give expected
        # result
        tmp = np.zeros(nrXYD).astype(np.float32)
        cuda.memcpy_htod(self.d_respV1c, tmp)

        # allocate d_resp, which will contain the response to all 28
        # (nrFilters) space-time orientations at 3 (nrScales) scales for
        # every pixel location (nrX*nrY)
        tmp = np.zeros(nrXY*self.nrFilters*self.nrScales).astype(np.float32)
        cuda.memcpy_htod(self.d_resp, tmp)
Example #31
 def cache_z(self, z):
     x = np.require(z.real,
                    dtype=np.double,
                    requirements=['A', 'W', 'O', 'C'])
     y = np.require(z.imag,
                    dtype=np.double,
                    requirements=['A', 'W', 'O', 'C'])
     xd = gpuarray.to_gpu(x)
     yd = gpuarray.to_gpu(y)
     cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes)
     cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
Example #32
File: parray.py Project: bionet/vtem
def arrayp2g(pary):
    """convert a PitchArray to a GPUArray"""
    from pycuda.gpuarray import GPUArray
    result = GPUArray(pary.shape, pary.dtype)
    if pary.size:
        if pary.M == 1:
            cuda.memcpy_dtod(result.gpudata, pary.gpudata, pary.mem_size * pary.dtype.itemsize)
        else:
            PitchTrans(pary.shape, result.gpudata, _pd(result.shape), pary.gpudata, pary.ld, pary.dtype)
            
    return result
Example #33
File: LPU.py Project: prabindh/neurokernel
 def _update_buffer(self):
     if self.my_num_gpot_neurons>0:
         cuda.memcpy_dtod(int(self.buffer.gpot_buffer.gpudata) + \
             self.buffer.gpot_current*self.buffer.gpot_buffer.ld* \
             self.buffer.gpot_buffer.dtype.itemsize, self.V.gpudata, \
             self.V.nbytes)
     if self.my_num_spike_neurons>0:
         cuda.memcpy_dtod(int(self.buffer.spike_buffer.gpudata) + \
             self.buffer.spike_current*self.buffer.spike_buffer.ld* \
             self.buffer.spike_buffer.dtype.itemsize, self.spike_state.gpudata,\
             int(self.spike_state.dtype.itemsize*self.my_num_spike_neurons))
Example #34
 def _set_state(self, k, v):
     cls = type(self)
     if k in self.params_dict:
         cuda.memcpy_dtod(self.states[k].gpudata,
                          self.params_dict[k].gpudata,
                          self.params_dict[k].nbytes)
     else:
         if isinstance(v, float):
             self.states[k].fill(self.floattype(v))
         else:
             assert(v in cls.states)
             self.states[k].fill(self.floattype(cls.states[v]))
Example #35
File: LPU.py Project: yiyin/neurokernel
 def _update_buffer(self):
     if self.total_num_gpot_neurons>0:
         cuda.memcpy_dtod(int(self.buffer.gpot_buffer.gpudata) +
             self.buffer.gpot_current*self.buffer.gpot_buffer.ld*
             self.buffer.gpot_buffer.dtype.itemsize,
             self.V.gpudata, self.V.nbytes)
     if self.total_num_spike_neurons>0:
         cuda.memcpy_dtod(int(self.buffer.spike_buffer.gpudata) +
             self.buffer.spike_current*self.buffer.spike_buffer.ld*
             self.buffer.spike_buffer.dtype.itemsize,
             self.spike_state.gpudata,
             int(self.spike_state.dtype.itemsize*self.total_num_spike_neurons))
Example #36
def gpuarray_copy(u):
    """Copes a gpuarray object.

    Args:
        u (gpuarray): Input array.

    Returns:
        gpuarray: Deep copy of the input array.
    """
    v = gpuarray.zeros_like(u)
    v.strides = u.strides
    cuda.memcpy_dtod(v.gpudata, u.gpudata, u.nbytes)
    return v
Example #37
    def update(self):
        nn, ne, nne = np.int32([self.nn, self.ne, self.nne])
        dt, de, vf = np.float64([self.dt, self.de, self.vf])
        bs, gs = (256,1,1), (self.nn//256+1,1)
        ul, ul_prev, ul_tmp = self.ul_gpu, self.ul_prev_gpu, self.ul_tmp_gpu
        kl = self.kl_gpu
        el_sum = self.el_sum_gpu
        c_ul_tmps = np.float32([0, 0.5, 0.5, 1])
        c_uls = np.float32([1./6, 1./3, 1./3, 1./6])

        cuda.memcpy_dtod(ul_prev, ul, self.ul.nbytes)
        for c_ul_tmp, c_ul in zip(c_ul_tmps, c_uls):
            self.update_pre(nn, nne, vf, c_ul_tmp, ul, ul_prev, ul_tmp, kl, el_sum, block=bs, grid=gs)
            self.update_ul(nn, ne, nne, dt, de, vf, c_ul, ul, ul_tmp, kl, el_sum, block=bs, grid=gs)
Example #38
File: LPU.py Project: prabindh/neurokernel
    def update_other_rest(self, gpot_data, my_num_gpot_neurons, num_virtual_gpot_neurons):
        if self.num_gpot_neurons > 0:
            d_other_rest = garray.zeros(num_virtual_gpot_neurons, np.double)
            a = 0
            for data in gpot_data.itervalues():
                if len(data) > 0:
                    cuda.memcpy_htod(int(d_other_rest.gpudata) +  a , data)
                    a += data.nbytes

            for i in range(self.gpot_delay_steps):
                cuda.memcpy_dtod( int(self.gpot_buffer.gpudata) + \
                    (self.gpot_buffer.ld * i + int(my_num_gpot_neurons)) * \
                    self.gpot_buffer.dtype.itemsize, d_other_rest.gpudata, \
                    d_other_rest.nbytes )
Example #39
    def get_cuda_tensor(self, sim, read_img=False):
        self.batch_renderer.render(sim)
        self.batch_renderer.map()
        torch_img = torch.cuda.ByteTensor(self.img_width, self.img_height, 3)
        torch_pointer = torch_img.data_ptr()

        render_pointer = self.batch_renderer._cuda_rgb_ptr
        cuda.memcpy_dtod(torch_pointer, render_pointer,
                         3 * self.img_width * self.img_height)
        true_img = None
        if read_img:
            true_img = self.batch_renderer.read()[0][0]
        self.batch_renderer.unmap()
        return torch_img, true_img
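
A hedged interop sketch in the spirit of the example above (not its actual API): memcpy_dtod only needs raw integer pointers, so it can copy between two PyTorch CUDA tensors via data_ptr(), assuming pycuda is attached to the same primary context that torch uses.

import torch
import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).retain_primary_context()   # share the primary context with torch
ctx.push()

src = torch.arange(12, dtype=torch.uint8, device='cuda')
dst = torch.zeros_like(src)
cuda.memcpy_dtod(dst.data_ptr(), src.data_ptr(),
                 src.numel() * src.element_size())
torch.cuda.synchronize()
assert torch.equal(dst, src)
ctx.pop()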
Example #40
 def slice_positions(self):
     '''Position of the respective slice start within the array
      self.particle_indices_by_slice.
     '''
     if not hasattr(self, '_slice_positions'):
         # the last entry of slice_positions needs to be n_slices,
         # the other entries are the same as lower_bounds
         self._slice_positions = gpuarray.zeros(
             self.n_slices + 1, dtype=self.lower_bounds.dtype)
         self._slice_positions += self.n_slices
         cuda.memcpy_dtod(self._slice_positions.gpudata,
                          self.lower_bounds.gpudata,
                          self.lower_bounds.nbytes)
     return self._slice_positions
Example #41
def stepFunction():
  global animIter
  cuda.memcpy_dtod( plotDataFloat_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
  maxVal = (gpuarray.max(plotDataFloat_d)).get()
  multiplyByScalarReal( cudaPre(0.5/(maxVal)), plotDataFloat_d )
  floatToUchar( plotDataFloat_d, plotDataChars_d)
  copyToScreenArray()
  if cudaP == "float": [ oneIteration_tex() for i in range(nIterationsPerPlot) ]
  #else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
  if plotting and animIter%25 == 0: 
    maxVals.append( maxVal )
    sumConc.append( gpuarray.sum(concentrationIn_d).get() )
    plotData( maxVals, sumConc )
  animIter += 1
Example #42
    def execute(self):
        if self.recvr_buf is None:
            # set_ipc_handle must be called before open_ipc_handle in certain cases to avoid a
            # hang, hence calling set_ in bind_buffers and open_ in execute.
            # See corresponding comment in ScatterRecv kernel for details.
            (self.tnsr_ipc_hdl, self.send_ready) = open_ipc_handle(
                self.op._shared_queues[self.op.idx])
            chunk_size = self.tensor.tensor.size * self.op.dtype.itemsize
            self.recvr_buf = int(self.tnsr_ipc_hdl) + self.op.idx * chunk_size

        # Push our fragment into its section of the larger recvr buffer, which assumes gather axis
        # is least contiguous.
        drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
        drv.memset_d8(self.send_ready, 1, 1)
Example #43
File: recorder.py Project: TK-21st/neural
    def _copy_memory_dtod(self, index):
        # downsample index
        d_index = int(index / self.rate)
        # buffer index
        b_index = d_index % self.buffer_length
        for key in self.dct.keys():
            src = getattr(self.obj, key)
            dst = int(self.gpu_dct[key].gpudata) + b_index * src.nbytes

            cuda.memcpy_dtod(dst, src.gpudata, src.nbytes)

        if (d_index == self.steps - 1) or (b_index == self.buffer_length - 1):
            for key in self.dct.keys():
                buffer = self.get_buffer(key, d_index)
                cuda.memcpy_dtoh(buffer, self.gpu_dct[key].gpudata)
Example #44
def stepFunction():
  global animIter
  if showActivity: 
    cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks )
    findActivityKernel( cudaPre(1.e-10), concentrationIn_d, activeBlocks_d, grid=grid2D, block=block2D  )
    getActivityKernel( activeBlocks_d, activeThreads_d, grid=grid2D, block=block2D ) 
  cuda.memcpy_dtod( plotData_d.ptr, concentrationOut_d.ptr, concentrationOut_d.nbytes )
  maxVal = gpuarray.max( plotData_d ).get()
  scalePlotData(100./maxVal, plotData_d, np.uint8(showActivity), activeThreads_d )
  if cudaP == "float": [ oneIteration_tex() for i in range(nIterationsPerPlot) ]
  else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
  if plotting and animIter%25 == 0: 
    maxVals.append( maxVal )
    sumConc.append( gpuarray.sum(concentrationIn_d).get() )
    plotData( maxVals, sumConc )
  animIter += 1
Example #45
File: LPU.py Project: neurokernel/lamina
 def _get_external_input(self):
     # use of intermediate I_ext can possibly be avoided
     input_ext = self.input_generator.next_input()
     if type(input_ext) == np.ndarray:
         self.I_ext.set(input_ext)
         cuda.memcpy_dtod(
             int(int(self.synapse_state.gpudata) +
             self.total_synapses*self.synapse_state.dtype.itemsize),
             int(self.I_ext.gpudata),
             self.num_input*self.synapse_state.dtype.itemsize)
     else:
         cuda.memcpy_dtod(
             int(int(self.synapse_state.gpudata) +
             self.total_synapses*self.synapse_state.dtype.itemsize),
             int(input_ext.gpudata),
             self.num_input*self.synapse_state.dtype.itemsize)
Example #46
    def _update_buffer(self):
        """
        Update circular buffer of past neuron states.
        """

        if self.total_num_gpot_neurons>0:
            cuda.memcpy_dtod(int(self.buffer.gpot_buffer.gpudata) +
                self.buffer.gpot_current*self.buffer.gpot_buffer.ld*
                self.buffer.gpot_buffer.dtype.itemsize,
                self.V.gpudata, self.V.dtype.itemsize*self.total_num_gpot_neurons)
        if self.total_num_spike_neurons>0:
            cuda.memcpy_dtod(int(self.buffer.spike_buffer.gpudata) +
                self.buffer.spike_current*self.buffer.spike_buffer.ld*
                self.buffer.spike_buffer.dtype.itemsize,
                self.spike_state.gpudata,
                int(self.spike_state.dtype.itemsize*self.total_num_spike_neurons))
Example #47
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8( self.gpudata,
                                   unpack_from('B', value)[0],
                                   self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0],
                                   self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0],
                                   self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
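
The itemsize dispatch above packs the fill value into the matching unsigned integer width before calling the driver memset. A minimal sketch of the 32-bit case (assumes pycuda, numpy; the float-to-bits reinterpretation mirrors the unpack_from call in the example):

import numpy as np
from struct import pack, unpack_from
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

a = gpuarray.empty(16, dtype=np.float32)
value = 1.0

# reinterpret the 4-byte float pattern as a uint32 for the driver memset
bits = unpack_from('I', pack('f', value))[0]
drv.memset_d32(a.gpudata, bits, a.size)          # count is in 32-bit elements
assert np.allclose(a.get(), 1.0)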
Example #48
def timeTransition():
  global realDynamics, alpha, applyTransition
  realDynamics = not realDynamics
  applyTransition = False
  if realDynamics:
    cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
    cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
    if realTEXTURE:
      copy3DpsiK1Real()
      copy3DpsiK1Imag()
      copy3DpsiK2Real()
      copy3DpsiK2Imag()
    print "Real Dynamics"
  else:    
    #GetAlphas
    getAlphas( dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ, psi_d, alphas_d, block = block3D, grid=grid3D)
    alpha= cudaPre( ( 0.5*(gpuarray.max(alphas_d) + gpuarray.min(alphas_d)) ).get() )  # OPTIMIZATION
    print "Imaginary Dynamics"
Example #49
File: LPU.py Project: prabindh/neurokernel
 def _read_external_input(self):
     if self.input_eof:
         return
     cuda.memcpy_dtod(int(int(self.synapse_state.gpudata) + \
         self.total_synapses*self.synapse_state.dtype.itemsize), \
         int(int(self.I_ext.gpudata) + self.frame_count*self.I_ext.ld*self.I_ext.dtype.itemsize), \
         self.num_input * self.synapse_state.dtype.itemsize)
     self.frame_count += 1
     if self.frame_count >= self.one_time_import:
         h_ext = self.input_h5file.root.array.read(self.file_pointer, self.file_pointer + self.one_time_import)
         if h_ext.shape[0] == self.I_ext.shape[0]:
             self.I_ext.set(h_ext)
             self.file_pointer += self.one_time_import
             self.frame_count = 0
         else:
             if self.file_pointer == self.input_h5file.root.array.shape[0]:
                 self.logger.info('Input end of file reached. Behaviour is ' +\
                                 'undefined for subsequent steps')
                 self.input_eof = True
Example #50
File: permute.py Project: shwina/cooper
def cuTranspose_permute_ept1(a_d, b_d, permutation):
    N = b_d.shape[0]
    if permutation == (2, 1, 0): #210
        permute_210_ept1.prepared_call((N/32, N/32, N), (32, 32, 1),
                b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (0, 2, 1): #102
        permute_102_ept1.prepared_call((N/32, N/32, N), (32, 32, 1),
                b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (1, 0, 2): #021
        permute_021_ept1.prepared_call((N/32, N/32, N), (32, 32, 1),
                b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (1, 2, 0): #201
        permute_201_ept1.prepared_call((N/32, N/32, N), (32, 32, 1),
                b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (2, 0, 1): #120
        permute_120_ept1.prepared_call((N/32, N/32, N), (32, 32, 1),
                b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (0, 1, 2):
        cuda.memcpy_dtod(b_d.gpudata, a_d.gpudata, b_d.nbytes)
Example #51
 def DT_GPU(self, X, c):
     DIM = X.size
     floatSize = X.dtype.itemsize
     q = gpuarray.empty(DIM / 2, X.dtype)
     p = gpuarray.empty(DIM / 2, X.dtype)
     XNext = gpuarray.empty(DIM, X.dtype)
     cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2)
     qNext = q + c * self.dt * self.dTdp(p)
     pNext = p
     cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2)
     return XNext
Example #52
 def DV_GPU(self, X, d):
     DIM = X.size
     floatSize = X.dtype.itemsize
     q = gpuarray.empty(DIM / 2, X.dtype)
     p = gpuarray.empty(DIM / 2, X.dtype)
     XNext = gpuarray.empty(DIM, X.dtype)
     cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2)
     qNext = q
     pNext = p - d * self.dt * self.dVdq(q)
     cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2)
     return XNext
Example #53
    def __setitem__(self, key, other):
        # if args is [:] then assign `other` to the entire ndarray
        if key == slice(None):
            if isinstance(other, pycuda.gpuarray.GPUArray):
                if self.flags.forc and other.flags.forc:
                    # both arrays are a contiguous block of memory
                    cuda.memcpy_dtod(self.gpudata, other.gpudata, self.nbytes)
                    return
            else:
                if self.flags.forc:
                    # both arrays are a contiguous block of memory
                    cuda.memcpy_htod(self.gpudata, other)
                    return

            copy_non_contiguous(self, other)
            return

        # assign `other` to sub-array of self
        sub_array = self[key]
        sub_array[:] = other
Example #54
File: parray.py Project: bionet/vtem
def make_pitcharray(dptr, shape, dtype, linear = False, pitch=None):
    """
    create a PitchArray from a DeviceAllocation pointer
    linear: "True" indicates the device memory is a linearly allocated 
            "False" indicates the device memory is allocated by cudaMallocPitch,
            and pitch must be provided
    """
    
    if linear:
        result = PitchArray(shape, dtype)
        if result.size:
            if result.M == 1:
                cuda.memcpy_dtod(result.gpudata, dptr, result.mem_size * dtype.itemsize)
            else:
                PitchTrans(shape, result.gpudata, result.ld, dptr, _pd(shape), dtype)
                
    else:
        result = PitchArray(shape, dtype, gpudata=dptr, pitch = pitch)
    
    return result
Example #55
File: parray.py Project: bionet/vtem
 def copy_rows(self, start, stop, step = 1):
     nrows = len(range(start,stop,step))
     if nrows:
         
         if self.ndim == 2:
             shape = (nrows, self.shape[1])
         else:
             shape = (nrows, self.shape[1], self.shape[2])
     else:
         if self.ndim == 2:
             shape = (nrows, 0)
         else:
             shape = (nrows, 0, 0)
     
     result = PitchArray(shape, self.dtype)
     
     if nrows > 1:
         PitchTrans(shape, result.gpudata, result.ld, int(self.gpudata) + start * self.ld * self.dtype.itemsize, self.ld * step, self.dtype)
     elif nrows == 1:
         cuda.memcpy_dtod(result.gpudata, int(self.gpudata) + start * self.ld * self.dtype.itemsize, self.dtype.itemsize * _pd(shape))
     return result
Example #56
def matrix_addition(d_a, d_b):
    # Overwrites d_a
    assert d_a.shape == d_b.shape
    if len(d_a.shape) == 1:
        # Vector addition
        cublas.cublasSaxpy(handle, d_a.size, 1.0, d_b.gpudata, 1, d_a.gpudata, 1)
    elif len(d_a.shape) == 2:
        # Matrix addition
        m, n = d_a.shape
        cublas.cublasSgeam(handle,
                'N', 'N',
                m, n,
                1.0,
                d_a.gpudata, m,
                1.0,
                d_b.gpudata, m,
                d_a.gpudata, m)
    else:
        tmp = (d_a.ravel() + d_b.ravel()).reshape(d_a.shape)
        cuda.memcpy_dtod(d_a.gpudata, tmp.gpudata, d_a.nbytes)
    return d_a
Example #57
File: lbm.py Project: jkotur/particles
	def wave_cu( self , dt ) :
		mpos = cuda_gl.BufferObject( long( self.gpos ) )
		dpos = mpos.map()

#        self._debug_print()

		self.collision.prepared_call( self.grid , self.block ,
			self.df1 , self.BOX , self.BOX , self.BOX )

#        self._debug_print()

		self.streaming.prepared_call( self.grid , self.block ,
			self.df1 , self.df2 , self.BOX , self.BOX , self.BOX )

		cuda_driver.memcpy_dtod( self.df1 , self.df2 , self.f.nbytes )

		self.colors.prepared_call( self.grid , self.block ,
			dpos.device_ptr() , self.df1 , self.BOX , self.BOX , self.BOX )

		dpos.unmap()
		mpos.unregister()