Пример #1
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = drv.SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        strm = drv.Stream()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), drv.In(a), drv.In(b),
                block=shape+(1,), stream=strm)
        strm.synchronize()

        self.assert_(la.norm(dest-a*b) == 0)
Пример #2
0
 def __call__(self):
     spikes = self.collected_spikes[:self.nspikes]
     total_neurons = self.net.total_neurons
     if self.use_gpu:
         if not hasattr(self, 'spikes_gpu'):
             spikes_bool = drv.pagelocked_zeros(total_neurons, dtype=uint32)
             spikes_bool[spikes] = True
             spikes_gpu = pycuda.gpuarray.to_gpu(spikes_bool)
             spikes_gpu_ptr = int(int(spikes_gpu.gpudata))
             self.spikes_bool = spikes_bool
             self.spikes_gpu = spikes_gpu
             self.spikes_gpu_ptr = spikes_gpu_ptr
         else:
             spikes_bool = self.spikes_bool
             spikes_bool[:] = False
             spikes_bool[spikes] = True
             spikes_gpu = self.spikes_gpu
             pycuda.driver.memcpy_htod(spikes_gpu.gpudata, spikes_bool)
             spikes_gpu_ptr = self.spikes_gpu_ptr
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_gpu_ptr, total_neurons)
         if not hasattr(self, 'acc'):
             self.acc = acc = drv.pagelocked_zeros(total_neurons, dtype=float32)
         else:
             acc = self.acc
         pycuda.driver.memcpy_dtoh(acc, acc_ptr)
     else:
         spikes_ptr = spikes.ctypes.data
         spikes_len = len(spikes)
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_ptr, spikes_len)
         acc = numpy_array_from_memory(acc_ptr, total_neurons, float32)
     for _, targetvar, targetslice in self.net.nemo_propagate_targets:
         targetvar += acc[targetslice]
     self.nspikes = 0
Пример #3
0
  def _allocate_arrays(self):
    #allocate gpu arrays and numpy arrays.
    if self.max_features < 4:
      imp_size = 4
    else:
      imp_size = self.max_features
    
    #allocate gpu arrays
    self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32)
    self.impurity_right = gpuarray.empty(self.max_features, dtype = np.float32)
    self.min_split = gpuarray.empty(self.max_features, dtype = self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)  
    self.label_total_2d = gpuarray.zeros(self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels, 
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32)
    self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8) 

    #allocate numpy arrays
    self.idx_array = np.zeros(2 * self.n_samples, dtype = np.uint32)
    self.si_idx_array = np.zeros(self.n_samples, dtype = np.uint8)
    self.nid_array = np.zeros(self.n_samples, dtype = np.uint32)
    self.values_idx_array = np.zeros(2 * self.n_samples, dtype = self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype = np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)
    self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32)  
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16)
    self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
Пример #4
0
  def _allocate_arrays(self):
    #allocate gpu arrays and numpy arrays.
    if self.max_features < 4:
      imp_size = 4
    else:
      imp_size = self.max_features
    
    #allocate gpu arrays
    self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32)
    self.impurity_right = gpuarray.empty(self.max_features, dtype = np.float32)
    self.min_split = gpuarray.empty(self.max_features, dtype = self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)  
    self.label_total_2d = gpuarray.zeros(self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels, 
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32)
    self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8) 

    #allocate numpy arrays
    self.idx_array = np.zeros(2 * self.n_samples, dtype = np.uint32)
    self.si_idx_array = np.zeros(self.n_samples, dtype = np.uint8)
    self.nid_array = np.zeros(self.n_samples, dtype = np.uint32)
    self.values_idx_array = np.zeros(2 * self.n_samples, dtype = self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype = np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)
    self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32)  
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16)
    self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
Пример #5
0
    def prepare(self, P):
        n = len(P.state_(self.eqs._diffeq_names_nonzero[0]))
        var_len = len(dict.fromkeys(
            self.eqs._diffeq_names)) + 1  # +1 needed to store t

        for index, varname in enumerate(self.eqs._diffeq_names):
            self.index_to_varname.append(varname)
            self.varname_to_index[varname] = index
            if varname in self.eqs._diffeq_names_nonzero:
                self.index_nonzero.append(index)

        self.S_in = cuda.pagelocked_zeros((n, var_len), numpy.float64)

        self.S_out = cuda.pagelocked_zeros((n, var_len), numpy.float64)

        nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize
        self.S_in_gpu = cuda.mem_alloc(nbytes)
        self.S_out_gpu = cuda.mem_alloc(nbytes)

        Z = zeros((n, var_len))
        self.A_gpu = cuda.mem_alloc(nbytes)
        cuda.memcpy_htod(self.A_gpu, Z)
        self.B_gpu = cuda.mem_alloc(nbytes)
        cuda.memcpy_htod(self.B_gpu, Z)
        self.S_temp_gpu = cuda.mem_alloc(nbytes)

        modFun = {}
        self.applyFun = {}
        for x in self.index_nonzero:
            s = self.eqs._function_C_String[self.index_to_varname[x]]
            args_fun = []
            for i in xrange(var_len):
                args_fun.append("S_temp[" + str(i) +
                                " + blockIdx.x * var_len]")
            modFun[x] = SourceModule("""
                __device__ double f""" + s + """
                
                __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len)
                { 
                    
                    int idx = x + blockIdx.x * var_len;
                    S_temp[idx] = 0;
                    B[idx] = f(""" + ",".join(args_fun) + """);
                    S_temp[idx] = 1;
                    A[idx] = f(""" + ",".join(args_fun) + """) - B[idx];
                    B[idx] /= A[idx];
                    S_temp[idx] = S_in[idx];
                }
                """)
            self.applyFun[x] = modFun[x].get_function("applyFun")
            self.applyFun[x].prepare(['P', 'P', 'P', 'P', 'i', 'i'],
                                     block=(1, 1, 1))

        self.calc_dict = {}
        self.already_calc = {}
Пример #6
0
    def getRT(self, s_map, srt_gpu, srt_nsamp, srt_npairs, npairs, store_rt=False):
        """
        Computes the rank template

        s_map(Sample Map) -  an list of 1s and 0s of length nsamples where 1 means use this sample
            to compute rank template
        srt_gpu - cuda memory object containing srt(sample rank template) array on gpu
        srt_nsamp, srt_npairs - shape(buffered) of srt_gpu object
        npairs - true number of gene pairs being compared
        b_size - size of the blocks for computation
        store_rt - determines the RETURN value
            False(default) = returns an numpy array shape(npairs) of the rank template
            True = returns the rt_gpu object and the padded size of the rt_gpu objet (rt_obj, npairs_padded)
        """

        b_size = self.b_size
        s_map_buff = self.s_map_buff = cuda.pagelocked_zeros((int(srt_nsamp),), np.int32,  mem_flags=cuda.host_alloc_flags.DEVICEMAP)

        s_map_buff[:len(s_map)] =  np.array(s_map,dtype=np.int32)

        s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())
        #cuda.memcpy_htod(s_map_gpu, s_map_buff)
        
        #sample blocks
        g_y_sz = self.getGrid( srt_nsamp)
        #pair blocks
        g_x_sz = self.getGrid( srt_npairs )
        
        block_rt_gpu =  cuda.mem_alloc(int(g_y_sz*srt_npairs*(np.uint32(1).nbytes)) ) 


        grid = (g_x_sz, g_y_sz)

        func1,func2 = self.getrtKern(g_y_sz)

        shared_size = b_size*b_size*np.uint32(1).nbytes

        func1( srt_gpu, np.uint32(srt_nsamp), np.uint32(srt_npairs), s_map_gpu, block_rt_gpu, np.uint32(g_y_sz), block=(b_size,b_size,1), grid=grid, shared=shared_size)

        rt_buffer =self.rt_buffer = cuda.pagelocked_zeros((int(srt_npairs),), np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
        rt_gpu = np.intp(rt_buffer.base.get_device_pointer())

        func2( block_rt_gpu, rt_gpu, np.int32(s_map_buff.sum()), block=(b_size,1,1), grid=(g_x_sz,))

        
        if store_rt:
            #this is in case we want to run further stuff without 
            #transferring back and forth
            return (rt_gpu, srt_npairs)
        else:
            #rt_buffer = np.zeros((srt_npairs ,), dtype=np.int32)
            #cuda.memcpy_dtoh(rt_buffer, rt_gpu)
            #rt_gpu.free()
            return rt_buffer[:npairs]
Пример #7
0
 def prepare(self, P):
     n = len(P.state_(self.eqs._diffeq_names_nonzero[0]))
     var_len  = len(dict.fromkeys(self.eqs._diffeq_names))+1 # +1 needed to store t
     
     for index,varname in enumerate(self.eqs._diffeq_names):
         self.index_to_varname.append(varname)
         self.varname_to_index[varname]= index
         if varname in self.eqs._diffeq_names_nonzero :
             self.index_nonzero.append(index)
     
     self.S_in = cuda.pagelocked_zeros((n,var_len),numpy.float64)
     
     self.S_out = cuda.pagelocked_zeros((n,var_len),numpy.float64)
     
     nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize
     self.S_in_gpu = cuda.mem_alloc(nbytes)
     self.S_out_gpu = cuda.mem_alloc(nbytes)
     
     Z = zeros((n,var_len))
     self.A_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.A_gpu, Z)
     self.B_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.B_gpu, Z)
     self.S_temp_gpu = cuda.mem_alloc(nbytes)
     
     modFun={}
     self.applyFun = {}
     for x in self.index_nonzero:
         s = self.eqs._function_C_String[self.index_to_varname[x]]
         args_fun =[]
         for i in xrange(var_len):
             args_fun.append("S_temp["+str(i)+" + blockIdx.x * var_len]")
         modFun[x] = SourceModule("""
             __device__ double f"""+ s +"""
             
             __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len)
             { 
                 
                 int idx = x + blockIdx.x * var_len;
                 S_temp[idx] = 0;
                 B[idx] = f("""+",".join(args_fun)+""");
                 S_temp[idx] = 1;
                 A[idx] = f("""+",".join(args_fun)+""") - B[idx];
                 B[idx] /= A[idx];
                 S_temp[idx] = S_in[idx];
             }
             """)
         self.applyFun[x] = modFun[x].get_function("applyFun")
         self.applyFun[x].prepare(['P','P','P','P','i','i'],block=(1,1,1))
     
     self.calc_dict = {}
     self.already_calc = {}
    def _initialize_gpu_ds(self):
        """
        Setup GPU arrays.
        """

        self.synapse_state = garray.zeros(
            max(int(self.total_synapses) + len(self.input_neuron_list), 1),
            np.float64)

        if self.total_num_gpot_neurons>0:
            # self.V = garray.zeros(
            #     int(self.total_num_gpot_neurons),
            #     np.float64)
            self.V_host = drv.pagelocked_zeros(
                int(self.total_num_gpot_neurons),
                np.float64, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.V = garray.GPUArray(self.V_host.shape,
                                     self.V_host.dtype,
                                     gpudata=self.V_host.base.get_device_pointer())
        else:
            self.V = None

        if self.total_num_spike_neurons > 0:
            # self.spike_state = garray.zeros(int(self.total_num_spike_neurons),
            #                                 np.int32)
            self.spike_state_host = drv.pagelocked_zeros(int(self.total_num_spike_neurons),
                            np.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.spike_state = garray.GPUArray(self.spike_state_host.shape,
                                               self.spike_state_host.dtype,
                                               gpudata=self.spike_state_host.base.get_device_pointer())
        self.block_extract = (256, 1, 1)
        if len(self.out_ports_ids_gpot) > 0:
            self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot)
            self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids)

            self._extract_gpot = self._extract_projection_gpot_func()

        if len(self.out_ports_ids_spk) > 0:
            self.out_ports_ids_spk_g = garray.to_gpu(
                (self.out_ports_ids_spk).astype(np.int32))
            self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids)

            self._extract_spike = self._extract_projection_spike_func()

        if self.ports_in_gpot_mem_ind is not None:
            inds = self.sel_in_gpot_ids
            self.inds_gpot = garray.to_gpu(inds)

        if self.ports_in_spk_mem_ind is not None:
            inds = self.sel_in_spk_ids
            self.inds_spike = garray.to_gpu(inds)
Пример #9
0
def allocate(n, dtype=numpy.float32):
    """ allocate context-portable device mapped host memory """
    return drv.pagelocked_zeros(int(n),
                                dtype,
                                order='C',
                                mem_flags=drv.host_alloc_flags.PORTABLE
                                | drv.host_alloc_flags.DEVICEMAP)
Пример #10
0
    def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id,
               npairs):
        """
        Returns the rank matching score
        rt_gpu - rank template gpu object (padded_npairs,)
        srt_gpu - sample rank template gpu object (padded_npairs, padded_samples)
        samp_id - the sample id to compare srt to rt
        npairs - true number of pairs
        b_size - the block size for gpu computation.
        """
        b_size = self.b_size
        gsize = int(padded_npairs / b_size)
        result = self.result = cuda.pagelocked_zeros(
            (gsize, ),
            dtype=np.int32,
            mem_flags=cuda.host_alloc_flags.DEVICEMAP)
        result_gpu = np.intp(
            result.base.get_device_pointer())  #cuda.mem_alloc(result.nbytes)

        func = self.getrmsKern()
        func(rt_gpu,
             srt_gpu,
             np.int32(samp_id),
             np.int32(padded_samples),
             np.int32(npairs),
             result_gpu,
             block=(b_size, 1, 1),
             grid=(int(gsize), ),
             shared=b_size * np.uint32(1).nbytes)
        self.ctx.synchronize()

        return result.sum() / float(npairs)
Пример #11
0
 def getBuff(self, frm, new_r, new_c, b_dtype):
     """
     Generates a numpy array sized (new_r,new_x) of dtype
         b_dtype that contains the np array frm such that
         frm[i,j] == new[i,j] wher new has zeros if
         frm[i,j] is out of bounds.
     """
     try:
         old_r,old_c =  frm.shape
         buff = cuda.pagelocked_zeros((new_r,new_c),b_dtype, mem_flags=cuda.host_alloc_flags.DEVICEMAP)#np.zeros((new_r,new_c),dtype=b_dtype)
         buff[:old_r,:old_c] = frm
     except ValueError:
         #oned
         old_r = frm.shape[0]
         buff = cuda.pagelocked_zeros((new_r,), b_dtype,mem_flags=cuda.host_alloc_flags.DEVICEMAP)# np.zeros((new_r,),dtype=b_dtype)
         buff[:old_r] = frm
     return buff
Пример #12
0
def GenerateFractal(dimensions,position,zoom,iterations,block=(20,20,1), report=False, silent=False):
	chunkSize = numpy.array([dimensions[0]/block[0],dimensions[1]/block[1]],dtype=numpy.int32)
	zoom = numpy.float32(zoom)
	iterations = numpy.int32(iterations)
	blockDim = numpy.array([block[0],block[1]],dtype=numpy.int32)
	result = numpy.zeros(dimensions,dtype=numpy.int32)

	#Center position
	position = Vector(position[0]*zoom,position[1]*zoom)
	position = position - (Vector(result.shape[0],result.shape[1])/2)
	position = numpy.array([int(position.x),int(position.y)]).astype(numpy.float32)

	#For progress reporting:
	ppc = cuda.pagelocked_zeros((1,1),numpy.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) #pagelocked progress counter
	ppc[0,0] = 0
	ppc_ptr = numpy.intp(ppc.base.get_device_pointer()) #pagelocked memory counter, device pointer to
	#End progress reporting

	#Copy parameters over to device
	chunkS = In(chunkSize)
	posit = In(position)
	blockD = In(blockDim)
	zoo = In(zoom)
	iters = In(iterations)
	res = In(result)

	if not silent:
		print("Calling CUDA function. Starting timer. progress starting at: "+str(ppc[0,0]))
	start_time = time.time()

	genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr, block=(1,1,1), grid=block)
	
	if report:
		total = (dimensions[0]*dimensions[1])
		print "Reporting up to "+str(total)+", "+str(ppc[0,0])
		while ppc[0,0] < ((dimensions[0]*dimensions[1])):
			pct = (ppc[0,0]*100)/(total)
			hashes = "#"*pct
			dashes = "-"*(100-pct)
			print "\r["+hashes+dashes+"] "+locale.format("%i",ppc[0,0],grouping=True)+"/"+locale.format("%i",total,grouping=True),
			time.sleep(0.00001)


	cuda.Context.synchronize()
	if not silent:
		print "Done. "+str(ppc[0,0])

	#Copy result back from device
	cuda.memcpy_dtoh(result, res)

	if not silent: 
		end_time = time.time()
		elapsed_time = end_time-start_time
		print("Done with call. Took "+str(elapsed_time)+" seconds. Here's the repr'd arary:\n")
		print(result)
		
	result[result.shape[0]/2,result.shape[1]/2]=iterations+1 #mark center of image
	return result
Пример #13
0
def find_component_device(d_v, d_D,  length):
    """

    :param d_v:
    :param d_D:
    :param ecount:
    :return:
    """
    import eulercuda.pyencode as enc
    logger = logging.getLogger('eulercuda.pycomponent.find_component_device')
    logger.info("started.")
    mem_size = length
    d_prevD = np.zeros(mem_size, dtype=np.uintc)
    d_Q = np.zeros_like(d_prevD)
    d_t1 = np.zeros_like(d_prevD)
    d_t2 = np.zeros_like(d_prevD)
    d_val1 = np.zeros_like(d_prevD)
    d_val2 = np.zeros_like(d_prevD)
    sp = np.uintc(0)

    s = np.uintc

    d_D, d_Q = component_step_init(d_v, d_D, d_Q, length)
    s, sp = 1, 1

    sptemp = drv.pagelocked_zeros(4, dtype=np.intc, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    d_sptemp = np.intp(sptemp.base.get_device_pointer())

    while s == sp:
        d_D, d_prevD = d_prevD, d_D

        d_D = component_step1_shortcutting_p1(d_v, d_prevD, d_D, d_Q, length, s)

        d_Q = component_step1_shortcutting_p2(d_v, d_prevD, d_D, d_Q, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step2_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_D, d_Q = component_Step2_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step3_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_D = component_Step3_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_val1 = component_step4_P1(d_v, d_D, d_val1, length)

        d_D = component_step4_P2(d_v, d_D, d_val1, length)

        sptemp[0] = 0

        d_sptemp = (d_Q, length, d_sptemp, s)

        sp += sptemp[0]

        s += 1

    logger.info("Finished. Leaving.")
    return d_D
Пример #14
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(drv.Out(dest),
                      a_gpu,
                      b_gpu,
                      block=shape + (1, ),
                      stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest - a * b) == 0
Пример #15
0
    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=np.float32)
        b = drv.pagelocked_zeros(shape, dtype=np.float32)
        a[:] = np.random.randn(*shape)
        b[:] = np.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
Пример #16
0
 def getBuff(self, frm, new_r, new_c, b_dtype):
     """
     Generates a numpy array sized (new_r,new_x) of dtype
         b_dtype that contains the np array frm such that
         frm[i,j] == new[i,j] wher new has zeros if
         frm[i,j] is out of bounds.
     """
     try:
         old_r, old_c = frm.shape
         buff = cuda.pagelocked_zeros(
             (new_r, new_c),
             b_dtype,
             mem_flags=cuda.host_alloc_flags.DEVICEMAP
         )  #np.zeros((new_r,new_c),dtype=b_dtype)
         buff[:old_r, :old_c] = frm
     except ValueError:
         #oned
         old_r = frm.shape[0]
         buff = cuda.pagelocked_zeros(
             (new_r, ), b_dtype, mem_flags=cuda.host_alloc_flags.DEVICEMAP
         )  # np.zeros((new_r,),dtype=b_dtype)
         buff[:old_r] = frm
     return buff
Пример #17
0
 def __call__(self):
     spikes = self.collected_spikes[:self.nspikes]
     total_neurons = self.net.total_neurons
     if self.use_gpu:
         if not hasattr(self, 'spikes_gpu'):
             spikes_bool = drv.pagelocked_zeros(total_neurons, dtype=uint32)
             spikes_bool[spikes] = True
             spikes_gpu = pycuda.gpuarray.to_gpu(spikes_bool)
             spikes_gpu_ptr = int(int(spikes_gpu.gpudata))
             self.spikes_bool = spikes_bool
             self.spikes_gpu = spikes_gpu
             self.spikes_gpu_ptr = spikes_gpu_ptr
         else:
             spikes_bool = self.spikes_bool
             spikes_bool[:] = False
             spikes_bool[spikes] = True
             spikes_gpu = self.spikes_gpu
             pycuda.driver.memcpy_htod(spikes_gpu.gpudata, spikes_bool)
             spikes_gpu_ptr = self.spikes_gpu_ptr
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_gpu_ptr,
                                               total_neurons)
         if not hasattr(self, 'acc'):
             self.acc = acc = drv.pagelocked_zeros(total_neurons,
                                                   dtype=float32)
         else:
             acc = self.acc
         pycuda.driver.memcpy_dtoh(acc, acc_ptr)
     else:
         spikes_ptr = spikes.ctypes.data
         spikes_len = len(spikes)
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_ptr, spikes_len)
         acc = numpy_array_from_memory(acc_ptr, total_neurons, float32)
     for _, targetvar, targetslice in self.net.nemo_propagate_targets:
         targetvar += acc[targetslice]
     self.nspikes = 0
Пример #18
0
 def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id, npairs):
     """
     Returns the rank matching score
     rt_gpu - rank template gpu object (padded_npairs,)
     srt_gpu - sample rank template gpu object (padded_npairs, padded_samples)
     samp_id - the sample id to compare srt to rt
     npairs - true number of pairs
     b_size - the block size for gpu computation.
     """
     b_size = self.b_size
     gsize = int(padded_npairs/b_size)
     result = self.result= cuda.pagelocked_zeros((gsize,), dtype=np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
     result_gpu = np.intp(result.base.get_device_pointer()) #cuda.mem_alloc(result.nbytes)
      
     func = self.getrmsKern()
     func( rt_gpu, srt_gpu, np.int32(samp_id), np.int32(padded_samples), np.int32(npairs), result_gpu, block=(b_size,1,1), grid=(int(gsize),), shared=b_size*np.uint32(1).nbytes )
     self.ctx.synchronize()
     
     return result.sum()/float(npairs)
Пример #19
0
def main():
    params0 = np.ones((args.dim, ), dtype=np.float32) / (np.sqrt(args.dim))

    loss, params, grad_cached, grad_assign_op = create_net('net1', params0)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    lr = 0.01
    import pycuda.driver as drv
    drv.init()
    print("%d device(s) found." % drv.Device.count())
    current_dev = drv.Device(0)  #device we are working on
    ctx = current_dev.make_context()  #make a working context
    ctx.push()  #let context make the lead

    params1 = drv.pagelocked_zeros((args.dim, ), dtype=np.float32)

    for i in range(10):
        loss0 = loss.eval()
        print(loss0)

        with timeit('step'):
            sess.run(grad_assign_op)

        with timeit('fetch'):
            grad0 = sess.run(grad_cached)

        # takes 75ms, 33ms is on allocation, 16ms on multiplication
        with timeit('add'):
            params0 -= grad0 * lr

        with timeit('feed'):
            #      params.load(params0)
            sess.run(params.initializer,
                     feed_dict={params.initial_value: params1})

    for key, times in global_timeit_dict.items():
        summarize_time(key, times)

    assert abs(loss0 - 0.69513524) < 0.01
    print('test passed')
Пример #20
0
 def __init__(self, N, model, threshold=None, reset=NoReset(),
              init=None, refractory=0 * msecond, level=0,
              clock=None, order=1, implicit=False, unit_checking=True,
              max_delay=0 * msecond, compile=False, freeze=False, method=None,
              precision='double', maxblocksize=512, forcesync=False, pagelocked_mem=True,
              gpu_to_cpu_vars=None, cpu_to_gpu_vars=None):
     eqs = model
     eqs.prepare()
     NeuronGroup.__init__(self, N, eqs, threshold=threshold, reset=reset,
                          init=init, refractory=refractory, level=level,
                          clock=clock, order=order, compile=compile, freeze=freeze, method=method)
     self.precision = precision
     if self.precision == 'double':
         self.precision_dtype = float64
         self.precision_nbytes = 8
     else:
         self.precision_dtype = float32
         self.precision_nbytes = 4
     self.clock = guess_clock(clock)
     if gpu_to_cpu_vars is None and cpu_to_gpu_vars is None:
         self._state_updater = GPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                        forcesync=forcesync)
     else:
         cpu_to_gpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes,
                             self.get_var_index(var) * len(self),
                             (self.get_var_index(var) + 1) * len(self)) for var in cpu_to_gpu_vars]
         gpu_to_cpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes,
                             self.get_var_index(var) * len(self),
                             (self.get_var_index(var) + 1) * len(self)) for var in gpu_to_cpu_vars]
         self._state_updater = UserControlledGPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                        gpu_to_cpu_vars=gpu_to_cpu_vars, cpu_to_gpu_vars=cpu_to_gpu_vars)
     if pagelocked_mem:
         self._S = GPUBufferedArray(drv.pagelocked_zeros(self._S.shape, dtype=self.precision_dtype))
     else:
         self._S = GPUBufferedArray(array(self._S, dtype=self.precision_dtype))
     self._gpuneurongroup_init_finished = True
Пример #21
0
        for i in range(1, ngpu):
            result = np.concatenate((result, mpi.world.recv(i, 10)))
        for i in xrange(ny):
            print result[:nx, i], '\t', result[nx:2 * nx,
                                               i], '\t', result[2 * nx:, i]


if __name__ == '__main__':
    cuda.init()
    ngpu = cuda.Device.count()
    ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST)

    nx, ny = 6, 5

    a_side_f = cuda.pagelocked_zeros(ny,
                                     np.float32,
                                     mem_flags=cuda.host_alloc_flags.DEVICEMAP)

    a = np.zeros((nx, ny), 'f')
    if mpi.rank == 0:
        a[-2, :] = 1.5
    elif mpi.rank == 1:
        a[1, :] = 2.0
        a[-2, :] = 2.5
    elif mpi.rank == 2:
        a[1, :] = 3.0
    a_gpu = cuda.to_device(a)

    if mpi.rank == 0: print 'dev 0', '\t' * 5, 'dev 1', '\t' * 5, 'dev 2'
    print_arr_gpus(ngpu, nx, ny, a_gpu)
	# prepare for plot
	import matplotlib.pyplot as plt
	plt.ion()
	fig = plt.figure(figsize=(15,7))
	'''
	fig = plt.figure(figsize=(10,13))
	ax1 = fig.add_subplot(3,1,1)
	ax1.imshow(fdtd.cex[nx/2,:,:].T, origin='lower', interpolation='nearest')
	ax2 = fig.add_subplot(3,1,2)
	ax2.imshow(fdtd.cey[nx/2,:,:].T, origin='lower', interpolation='nearest')
	ax3 = fig.add_subplot(3,1,3)
	ax3.imshow(fdtd.cez[nx/2,:,:].T, origin='lower', interpolation='nearest')
	plt.show()
	'''
	#ez_tmp = np.ones((800,ny,nz), 'f')
	ez_tmp = cuda.pagelocked_zeros((800,ny,nz), 'f')

	ax1 = fig.add_subplot(4,1,1)
	imag1 = ax1.imshow(ez_tmp[:,:,nz/2].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest')
	ax2 = fig.add_subplot(4,1,2)
	imag2 = ax2.imshow(ez_tmp[:,ny/2,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest')
	ax3 = fig.add_subplot(2,2,3)
	imag3 = ax3.imshow(ez_tmp[500,:,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2., interpolation='nearest')
	ax4 = fig.add_subplot(2,2,4)
	imag4 = ax4.imshow(ez_tmp[700,:,:].T, cmap=plt.cm.jet, origin='lower', vmin=-2, vmax=2, interpolation='nearest')

	# ez^2 sum
	s1 = np.zeros(200)
	s2 = np.zeros(200)
	ez_tmp1 = cuda.pagelocked_zeros((ny,nz), 'f')
	ez_tmp2 = cuda.pagelocked_zeros((ny,nz), 'f')
Пример #23
0
	def run_simulation(self):

		# setup data#{{{
		data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T }
		base_shape = self.n_work_items,
		for name, shape in dict(
			tavg0=(self.exposures, self.args.n_regions,),
			tavg1=(self.exposures, self.args.n_regions,),
			state=(self.buf_len, self.states * self.args.n_regions),
			).items():
			# memory error exception for compute device
			try:
				data[name] = np.zeros(shape + base_shape, 'f')
			except MemoryError as e:
				self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are to large '
							 'for this compute device',
							 e, self.args.n_sweep_arg0, self.args.n_sweep_arg1)
				exit(1)

		gpu_data = self.make_gpu_data(data)#{{{

		# setup CUDA stuff#{{{
		step_fn = self.make_kernel(
			source_file=self.args.filename,
			warp_size=32,
			# block_dim_x=self.args.n_sweep_arg0,
			# ext_options=preproccesor_defines,
			# caching=args.caching,
			args=self.args,
			lineinfo=self.args.lineinfo,
			nh=self.buf_len,
			)#}}}

		# setup simulation#{{{
		tic = time.time()

		n_streams = 32
		streams = [drv.Stream() for i in range(n_streams)]
		events = [drv.Event() for i in range(n_streams)]
		tavg_unpinned = []

		try:
			tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32)
		except drv.MemoryError as e:
			self.logger.error(
				'%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU',
				e, self.params.size)
			exit(1)

		# determine optimal grid recursively
		def dog(fgd):
			maxgd, mingd = max(fgd), min(fgd)
			maxpos = fgd.index(max(fgd))
			if (maxgd - 1) * mingd * bx * by >= nwi:
				fgd[maxpos] = fgd[maxpos] - 1
				dog(fgd)
			else:
				return fgd

		# n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y
		# form an optimal grid recursively
		bx, by = self.args.blockszx, self.args.blockszy
		nwi = self.n_work_items
		rootnwi = int(np.ceil(np.sqrt(nwi)))
		gridx = int(np.ceil(rootnwi / bx))
		gridy = int(np.ceil(rootnwi / by))

		final_block_dim = bx, by, 1

		fgd = [gridx, gridy]
		dog(fgd)
		final_grid_dim = fgd[0], fgd[1]

		assert gridx * gridy * bx * by >= nwi

		self.logger.info('history shape %r', gpu_data['state'].shape)
		self.logger.info('gpu_data %s', gpu_data['tavg0'].shape)
		self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))
		self.logger.info('final block dim %r', final_block_dim)
		self.logger.info('final grid dim %r', final_grid_dim)

		# run simulation#{{{
		nstep = self.args.n_time

		self.gpu_mem_info() if self.args.verbose else None

		try:
			for i in tqdm.trange(nstep, file=sys.stdout):

				try:
					event = events[i % n_streams]
					stream = streams[i % n_streams]

					if i > 0:
						stream.wait_for_event(events[(i - 1) % n_streams])

					step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len),
							np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt),
							gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'],
							gpu_data['tavg%d' % (i%2,)],
							block=final_block_dim, grid=final_grid_dim)

					event.record(streams[i % n_streams])
				except drv.LaunchError as e:
					self.logger.error('%s', e)
					exit(1)

				tavgk = 'tavg%d' % ((i + 1) % 2,)

				# async wrt. other streams & host, but not this stream.
				if i >= n_streams:
					stream.synchronize()
					tavg_unpinned.append(tavg[i % n_streams].copy())

				drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream)

			# recover uncopied data from pinned buffer
			if nstep > n_streams:
				for i in range(nstep % n_streams, n_streams):
					stream.synchronize()
					tavg_unpinned.append(tavg[i].copy())

			for i in range(nstep % n_streams):
				stream.synchronize()
				tavg_unpinned.append(tavg[i].copy())

		except drv.LogicError as e:
			self.logger.error('%s. Check the number of states of the model or '
						 'GPU block shape settings blockdim.x/y %r, griddim %r.',
						 e, final_block_dim, final_grid_dim)
			exit(1)
		except drv.RuntimeError as e:
			self.logger.error('%s', e)
			exit(1)


		# self.logger.info('kernel finish..')
		# release pinned memory
		tavg = np.array(tavg_unpinned)

		# also release gpu_data
		self.release_gpumem(gpu_data)

		self.logger.info('kernel finished')
		return tavg
    exec_time = {
        'update_h': np.zeros(tmax),
        'mpi_recv_h': np.zeros(tmax),
        'memcpy_htod_h': np.zeros(tmax),
        'mpi_send_h': np.zeros(tmax),
        'memcpy_dtoh_h': np.zeros(tmax),
        'update_e': np.zeros(tmax),
        'mpi_recv_e': np.zeros(tmax),
        'memcpy_htod_e': np.zeros(tmax),
        'mpi_send_e': np.zeros(tmax),
        'memcpy_dtoh_e': np.zeros(tmax),
        'src_e': np.zeros(tmax)
    }

# main loop
ey_tmp = cuda.pagelocked_zeros((ny, nz), 'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
for tn in xrange(1, tmax + 1):
    if rank == 1: start.record()
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args)

    if rank == 0:
        cuda.memcpy_dtoh(
            hy_tmp,
            int(hy_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
        cuda.memcpy_dtoh(
            hz_tmp,
            int(hz_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
def main():
    #Set up global timer
    tot_time = time.time()

    #Define constants
    BankSize = 8 # Do not go beyond 8!
    WarpSize = 32 #Do not change...
    DimGridX = 19
    DimGridY = 19
    BlockDimX = 256
    BlockDimY = 256
    SearchSpaceSize = 2**24 #BlockDimX * BlockDimY  * 32
    FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    AlignedByteLengthGenome = 4

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":AlignedByteLengthGenome,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":WarpSize,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))
    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix

    #Set-up genomes
    #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_size_h = drv.mem_alloc(fitness_size.nbytes)
    #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0)
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize)
    BlockCounter = 0
    print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles
    #quit()

    #SET UP PROCESSING
    histo = {}
     
    #INITIALISATION
    CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY))
    print "Finished Curand kernel, starting main kernel..."

    #FIRST GENERATION
    proc_time = time.time()
    print "Starting first generation..."
    start = drv.Event()
    stop = drv.Event()
    start.record()
    Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)    
    print "Copying..."
    drv.memcpy_dtoh(fitness_size, fitness_size_h)
    drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
    drv.memcpy_dtoh(grids, grids_h)

    #INTERMEDIATE GENERATION
    for i in range(MaxBlockCycles-1):
        print "Starting generation: ", i+1
        start = drv.Event()
        stop = drv.Event()
        start.record()
        Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
        "Processing..."
        for j in range(grids.shape[0]):
#            if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22):
            if fitness_hash[j] in histo: 
                histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1)
            else:
                histo[fitness_hash[j]] = (grids[j], 1)

        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3))
        print "Copying..."
        drv.memcpy_dtoh(fitness_size, fitness_size_h)
        drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
        drv.memcpy_dtoh(grids, grids_h)

    #FINAL PROCESSING
    "Processing..."
    for i in range(grids.shape[0]):
        if fitness_hash[i] in histo:
            histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1)
        else:
            histo[fitness_hash[i]] = (grids[i], 1)

    print "Done!"

    #TIMING RESULTS
    print "Total time including set-up: ", (time.time() - tot_time)
    print "Total Processing time: ", (time.time() - proc_time)

    #OUTPUT    
    print histo
		result = cuda.from_device(a_gpu, (nx,ny), 'float32')
		print ngpu
		for i in range(1,ngpu): 
			result = np.concatenate((result, mpi.world.recv(i,10)))
		for i in xrange(ny):
			print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i]


if __name__ == '__main__':
	cuda.init()
	ngpu = cuda.Device.count()
	ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST)
	
	nx, ny = 6, 5

	a_side_f = cuda.pagelocked_zeros(ny, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)

	a = np.zeros((nx,ny),'f')
	if mpi.rank == 0: 
		a[-2,:] = 1.5
	elif mpi.rank == 1: 
		a[1,:] = 2.0
		a[-2,:] = 2.5
	elif mpi.rank == 2: 
		a[1,:] = 3.0
	a_gpu = cuda.to_device(a)

	if mpi.rank == 0: print 'dev 0','\t'*5,'dev 1','\t'*5,'dev 2'
	print_arr_gpus(ngpu, nx, ny, a_gpu)

	if mpi.rank == 0:
Пример #27
0
from wavemoth.cuda.profile import cuda_profile

N = 4092

nside = 2048
npix = 12*2048**2
nrings = 4 * nside - 1
lmax = 2 * nside

#x = np.asarray(np.random.rand(N), np.float32)
#xf = np.fft.fft(x)
#x_gpu = gpuarray.to_gpu(x)
#xf_gpu = gpuarray.empty(N/2+1, np.complex64)

map = drv.pagelocked_zeros(npix, np.float64)
buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128)

map_gpu = drv.mem_alloc(npix * 8)
buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16)

drv.memcpy_htod(map_gpu, map)

from wavemoth.cuda import cufft

print 'ctoring plan'
plan = cufft.HealpixCuFFTPlan(2048, 8)

repeats = 1
print 'plan ctored'
with cuda_profile() as prof:
		ey_gpu = cuda.to_device(f)
		ez_gpu = cuda.to_device(f)
		hx_gpu = cuda.to_device(f)
		hy_gpu = cuda.to_device(f)
		hz_gpu = cuda.to_device(f)

		cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) )
		cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) )
		cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) )
		chx_gpu = cuda.to_device( set_c(f,(None,0,0)) )
		chy_gpu = cuda.to_device( set_c(f,(0,None,0)) )
		chz_gpu = cuda.to_device( set_c(f,(0,0,None)) )

		# pinned memory allocation for zero-copy
		if myrank != 1:
			ex_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			ey_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hx_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hy_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			'''
			ex_send_map = ex_send.get_device_pointer()
			ey_send_map = ey_send.get_device_pointer()
			hx_recv_map = hx_recv.get_device_pointer()
			hy_recv_map = hy_recv.get_device_pointer()
			'''
		if myrank != 3:
			ex_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			ey_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hx_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hy_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			'''
Пример #29
0
model_file = 'erfnet_nobn.uff'
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
) as network, trt.UffParser() as parser:
    # (num_channels, h, w)
    parser.register_input("inputs/X", (1, 256, 256))
    parser.register_output("up23/BiasAdd")
    parser.parse(model_file, network)

    builder.max_batch_size = max_batch_size
    builder.max_workspace_size = 1 << 20  # This determines the amount of memory available to the builder when building an optimized engine and should generally be set as high as possible.
    with builder.build_cuda_engine(network) as engine:
        with engine.create_execution_context() as context:
            # h_input = cuda.pagelocked_zeros(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
            h_input = np.ones((1, 256, 256))
            h_output = cuda.pagelocked_zeros(trt.volume(
                engine.get_binding_shape(1)),
                                             dtype=np.float32)
            # Allocate device memory for inputs and outputs.
            d_input = cuda.mem_alloc(h_input.nbytes)
            d_output = cuda.mem_alloc(h_output.nbytes)
            # Create a stream in which to copy inputs/outputs and run inference.
            stream = cuda.Stream()

            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(d_input, h_input, stream)
            # Run inference.
            context.execute_async(bindings=[int(d_input),
                                            int(d_output)],
                                  stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
Пример #30
0
# Setup the kernel
mod = SourceModule("""
__global__ void add(float *a, float *b, float *c, float *c_map) {
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	float val;

	val = a[idx] + b[idx];
	c[idx] = val;
	c_map[idx] = val;
}
""")
add = mod.get_function("add")

# Memory allocation
nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)
c = np.zeros_like(a)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)

# Page-locked host memory allocation for zero-copy
c_map = cuda.pagelocked_zeros(nx, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)

add( a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256,1,1), grid=(4,1) )
assert( np.linalg.norm( (a+b)-c ) == 0 )
assert( np.linalg.norm( (a+b)-c_map ) == 0 )

ctx.pop()
	# measure kernel execution time
	from datetime import datetime
	t1 = datetime.now()
	flop = 3*(nx*ny*nz*30)*tgap
	flops = np.zeros(tmax/tgap+1)
	start, stop = cuda.Event(), cuda.Event()
	start.record()

elif rank == 1:
	start, stop = cuda.Event(), cuda.Event()
	exec_time = {'update_h':np.zeros(tmax), 'mpi_recv_h':np.zeros(tmax), 'memcpy_htod_h':np.zeros(tmax), 'mpi_send_h':np.zeros(tmax), 'memcpy_dtoh_h':np.zeros(tmax), 
			'update_e':np.zeros(tmax), 'mpi_recv_e':np.zeros(tmax), 'memcpy_htod_e':np.zeros(tmax), 'mpi_send_e':np.zeros(tmax), 'memcpy_dtoh_e':np.zeros(tmax), 
			'src_e':np.zeros(tmax)}

# main loop
ey_tmp = cuda.pagelocked_zeros((ny,nz),'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
for tn in xrange(1, tmax+1):
	if rank == 1: start.record()
	for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)

	if rank == 0:
		cuda.memcpy_dtoh(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		cuda.memcpy_dtoh(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		comm.Send(hy_tmp, 1, 20)
		comm.Send(hz_tmp, 1, 21)
	elif rank == 1:
		stop.record()
		stop.synchronize()
Пример #32
0
    print("BGRA2NV12=%s" % BGRA2NV12)

    w = roundup(512, 32)
    h = roundup(512, 32)

    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)),
        inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2,
                                          dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)

    outputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2,
                                           dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)

    #populate host buffer with random data:
    buf = inputBuffer.data
    for y in range(h * 3 / 2):
        dst = y * inputPitch
        #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane))
        for x in range(w):
            buf[dst + x] = numpy.byte((x + y) % 256)

    #copy input buffer to CUDA buffer:
Пример #33
0
    def _pre_run(self):
        assert (self.LPU_obj)
        assert all([var in self.memory_manager.variables
                    for var in self.variables.keys()]),\
               (list(self.memory_manager.variables), list(self.variables.keys()))
        self.add_inds_func = {}

        if self.input_file is not None:
            self.input_file_handle = h5py.File(self.input_file, 'w')
            self.input_file_handle.create_dataset('metadata', (), 'i')
            self.input_file_handle['metadata'].attrs['dt'] = self.dt
            self.input_file_handle['metadata'].attrs[
                'sample_interval'] = self.input_interval
            self.input_file_handle['metadata'].attrs[
                'DateCreated'] = datetime.now().isoformat()
            for k, v in self.metadata.items():
                self.input_file_handle['metadata'].attrs[k] = v

        for var, d in self.variables.items():
            v_dict = self.memory_manager.variables[var]
            uids = []
            inds = []
            for uid in d['uids']:
                cd = self.LPU_obj.conn_dict[uid]
                assert (var in cd)
                pre = cd[var]['pre'][0]
                inds.append(v_dict['uids'][pre])

            if isinstance(var, str):
                self.dest_inds[var] = garray.to_gpu(np.array(inds, np.int32))
                self.dtypes[var] = v_dict['buffer'].dtype
                self._d_input[var] = garray.zeros(len(d['uids']),
                                                  self.dtypes[var])
                if self.memory_mode == 'cpu':
                    self.variables[var]['input'] = cuda.pagelocked_zeros(
                        len(d['uids']), self.dtypes[var])
                elif self.memory_mode == 'gpu':
                    self.variables[var]['input'] = self._d_input[var]
                self.add_inds_func[var] = get_inds_kernel(
                    self.dest_inds[var].dtype, v_dict['buffer'].dtype)

            elif isinstance(var, tuple):
                n = len(var)
                new_inds = list(
                    itertools.chain.from_iterable(
                        [[ind * n + i for i in range(n)] for ind in inds]))
                self.dest_inds[var] = garray.to_gpu(
                    np.array(new_inds, np.int32))
                self.dtypes[var] = v_dict['buffer'].dtype
                self._d_input[var] = garray.zeros(
                    len(d['uids']) * n, self.dtypes[var])
                if self.memory_mode == 'cpu':
                    self.variables[var]['input'] = cuda.pagelocked_zeros(
                        len(d['uids']) * n, self.dtypes[var])
                elif self.memory_mode == 'gpu':
                    self.variables[var]['input'] = self._d_input[var]
                self.add_inds_func[var] = get_inds_kernel(
                    self.dest_inds[var].dtype, v_dict['buffer'].dtype)

            else:
                raise TypeError(
                    'variable name must either be a str or a tuple of str')

            if self.input_file is not None:
                self.input_file_handle.create_dataset('{}/uids'.format(var),
                                                      data=np.array(d['uids'],
                                                                    dtype='S'))
                if isinstance(var, str):
                    self.input_file_handle.create_dataset(
                        '{}/data'.format(var), (0, len(d['uids'])),
                        d['input'].dtype,
                        maxshape=(None, len(d['uids'])))
                elif isinstance(var, tuple):
                    n = len(var)
                    for ind_var in var:
                        self.input_file_handle.create_dataset(
                            '{}/data/{}'.format(var,
                                                ind_var), (0, len(d['uids'])),
                            d['input'].dtype,
                            maxshape=(None, len(d['uids'])))
                else:
                    raise TypeError(
                        'variable name must either be a str or a tuple of str')
        self.record_count = 0

        self.pre_run()
Пример #34
0
    def prepare(self):
        '''
		From Hines 1984 paper, discrete formula is:
		A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i)
       
		A_plus: i->i+1
		A_minus: i->i-1
		
        This gives the following tridiagonal system:
        A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i)
        
        Boundaries, one simple possibility (sealed ends):
        -(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n)
        A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0)
        '''
        mid_diameter = zeros(len(self.neuron))  # mid(i) : (i-1) <-> i
        mid_diameter[1:] = .5 * (self.neuron.diameter[:-1] +
                                 self.neuron.diameter[1:])

        self.Aplus = zeros(len(self.neuron))  # A+ i -> j = Aplus(j)
        self.Aminus = zeros(len(self.neuron))  # A- i <- j = Aminus(j)
        self.Aplus[1] = mid_diameter[1]**2 / (4 * self.neuron.diameter[1] *
                                              self.neuron.length[1]**2 *
                                              self.neuron.Ri)
        self.Aplus[2:] = mid_diameter[2:]**2 / (
            4 * self.neuron.diameter[1:-1] * self.neuron.length[1:-1]**2 *
            self.neuron.Ri)
        self.Aminus[1:] = mid_diameter[1:]**2 / (4 * self.neuron.diameter[1:] *
                                                 self.neuron.length[1:]**2 *
                                                 self.neuron.Ri)

        self.neuron.index = zeros(
            len(self.neuron), int
        )  # gives the index of the branch containing the current compartment
        self.neuron.branches = []  # (i,j,bp,ante,ante_index,pointType)
        # i is the first compartment
        # bp is the last, a branch point
        # j is the end of the "inner branch". j = bp-1
        # ante is the branch point to which i is connected

        self.neuron.BPcount = 0  # number of branch points (or branches). = len(self.neuron.branches)
        self.neuron.long_branches_count = 0  # number of branches with len(branch) > 1

        #self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
        #self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
        #self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)

        self.bL = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64)
        self.bR = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64)
        #self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
        self.ab = zeros((3, len(self.neuron)))
        self.ab0 = zeros(len(self.neuron))
        self.ab1 = cuda.pagelocked_zeros((len(self.neuron)), numpy.float64)
        self.ab2 = zeros(len(self.neuron))
        self.ab1_base = zeros(len(self.neuron))
        #self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64)

        self.mTrunc = 0  # used to truncate vL and vR
        self.delta_list = zeros(len(self.neuron))  #used to find mTrunc

        # prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus
        self.prepare_branch(self.neuron.morphology, mid_diameter, 0)

        # linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account.
        self.P = zeros((self.neuron.BPcount, self.neuron.BPcount))
        self.B = zeros(self.neuron.BPcount)
        self.solution_bp = zeros(self.neuron.BPcount)

        self.gtot = zeros(len(self.neuron))
        self.I0 = zeros(len(self.neuron))
        self.i_list = []
        self.j_list = []
        self.i_list_bis = []
        self.j_list_bis = []
        new_tridiag = True
        self.bp_list = []
        self.pointType_list = []
        self.pointTypeAnte_list = []
        self.index_ante_list0 = []
        self.index_ante_list1 = []
        self.index_ante_list2 = []
        self.ante_list = []
        self.post_list = []
        self.ante_list_idx = []
        self.post_list_idx = []
        self.id = []
        self.test_list = []
        temp = zeros(self.neuron.BPcount)
        self.ind0 = []
        self.ind_bctype_0 = []
        for index, (i, j, bp, ante, index_ante,
                    pointType) in enumerate(self.neuron.branches):
            self.i_list.append(i)
            self.j_list.append(j)
            if new_tridiag:
                self.i_list_bis.append(i)
                ii = i
            else:
                ii = self.i_list[-1]
            if j - ii + 1 > 2:
                self.j_list_bis.append(j)
                new_tridiag = True
            else:
                new_tridiag = False
            self.bp_list.append(bp)
            self.pointType_list.append(max(1, pointType))
            self.pointTypeAnte_list.append(max(1, self.neuron.bc[ante]))
            temp[index] = index_ante
            self.id.append(index)
            if (j - i + 2 > 1):
                self.test_list.append(1)
            else:
                self.test_list.append(0)
            for x in xrange(j - i + 2):
                self.ante_list.append(ante)
                self.post_list.append(bp)
                self.ante_list_idx.append(index_ante)
                self.post_list_idx.append(index)
            if index_ante == 0 and index != 0:
                self.ind0.append(index)
            if pointType == 0:
                self.ind_bctype_0.append(bp)

        self.ante_arr = numpy.array(self.ante_list_idx)
        self.post_arr = numpy.array(self.post_list_idx)

        self.index_ante_list1, self.ind1 = numpy.unique(temp,
                                                        return_index=True)
        self.ind1 = numpy.sort(self.ind1)
        self.index_ante_list1 = temp[self.ind1]
        self.index_ante_list1 = list(self.index_ante_list1)
        self.ind2 = []
        for x in xrange(self.neuron.BPcount):
            self.ind2.append(x)
        self.ind2 = numpy.delete(self.ind2, self.ind1, None)
        self.ind2 = numpy.setdiff1d(self.ind2, self.ind0, assume_unique=True)
        self.index_ante_list2 = temp[self.ind2]
        self.index_ante_list2 = list(self.index_ante_list2)

        self.index_ante_list = list(temp)
        self.Aminus_bp = self.Aminus[self.bp_list]
        self.Aminus_bp[:] *= self.pointType_list[:]
        self.Aplus_i = self.Aplus[self.i_list]
        self.Aplus_i[:] *= self.pointTypeAnte_list[:]
        #--------------------------------------------------------GPU------------------------
        n = len(self.neuron)

        mod = SourceModule("""
        __global__ void updateAB_gtot(double *ab1, double *ab1_base, double *gtot)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          
          ab1[idx] = ab1_base[idx] - gtot[idx];
          ab1[idx+gridDim.x] = ab1_base[idx] - gtot[idx];
          ab1[idx+2*gridDim.x] = ab1_base[idx] - gtot[idx];
        }
        
        __global__ void updateBD(double *bd, double *Cm, double dt,double *v, double *I0)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          
          bd[idx] = - Cm[idx] / dt * v[idx] - I0[idx];
        }
        
        __global__ void finalizeFun(double *v, double *v_bp, int *ante,int *post, double *b, int m)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int idx_a = ante[idx];
          int idx_p = post[idx];
          
          v[idx] = b[idx + m] * v_bp[idx_a] + b[idx + 2*m] * v_bp[idx_p] + b[idx]; // vL, vR, d
        }
        
        __global__ void finalizeFunBis(double *v, double *v_bp, int *GPU_data_int)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int bp = GPU_data_int[4 * idx + 3]; 
          v[bp] = v_bp[idx];
        }
        
        __global__ void initPB(double *P, double *B, int BPcount)
        {
        	int idx = threadIdx.x + blockIdx.x * blockDim.x;
        	int idy = threadIdx.x + blockIdx.y * blockDim.y;
        	
        	P[idx + idy * BPcount] = 0.0;
        	B[idx] = 0.0;
        }
        
        //GPU_data_int is : ante_list, i_list, j_list, bp_list
        //GPU_data_double is : test_list, Aplus, Aminus
        __global__ void fillPB(double *P, double *B, double *b, double *Cm_l, double *gtot_l, int *GPU_data_int,
                        double *GPU_data_double, double *I0_l, double *v_l, int BPcount, double dt, int m)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int idx_ante = GPU_data_int[4 * idx];
          int i = GPU_data_int[4 * idx + 1];
          int j = GPU_data_int[4 * idx + 2];
          int bp = GPU_data_int[4 * idx + 3];
          double test = GPU_data_double[3 * idx];
          double Aplus = GPU_data_double[3 * idx + 1];
          double Aminus = GPU_data_double[3 * idx + 2];
          double Cm = Cm_l[bp];
          double gtot = gtot_l[bp];
          double I0 = I0_l[bp];
          double v_bp = v_l[bp];
          double vLright = b[j + m] * test;
          double vRleft = b[i + 2*m] * test;
          double vRright = b[j + 2*m] * test;
          double dright = b[j] * test;
          
          B[idx] += -Cm/dt * v_bp - I0 -Aminus * dright;
          P[idx * BPcount + idx] += -Cm/dt - gtot + Aminus * (vRright - 1.0);
          P[idx * BPcount + idx_ante] += Aminus * vLright;
          P[idx_ante * BPcount + idx] += Aplus * vRleft;
        }
        
        __global__ void fillPB_bis(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double,
                        int BPcount, int *indices, int m)
        { 
          int idx_temp = threadIdx.x + blockIdx.x * blockDim.x;
          int idx = indices[idx_temp];
          int idx_ante = GPU_data_int[4 * idx];
          int i = GPU_data_int[4 * idx + 1];
          int test = GPU_data_double[3 * idx];
          double Aplus = GPU_data_double[3 * idx + 1];
          double vLleft = b[i + m] * test;
          double dleft = b[i] * test;
          
          B[idx_ante] += - Aplus * dleft;
          P[idx_ante * (BPcount + 1)] += Aplus * (vLleft - 1.0);
        }
        
        __global__ void badFillPB_0(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double,
                        int *indices, double *Cm_l, double *gtot_l, double *I0_l, double *v_l, int len_indices, int m, double dt)
        { 
          double Cm = Cm_l[0];
          double gtot = gtot_l[0];
          double I0 = I0_l[0];
          double v_0 = v_l[0];
          
          B[0] = - Cm/dt * v_0 - I0;
		  P[0] = - Cm/dt - gtot;
		  
		  int idx;
		  int i;
		  int test;
		  double Aplus;
		  double vLleft;
		  double dleft;
          for (int idx_temp=0;idx_temp<len_indices;idx_temp++)
          {
          	idx = indices[idx_temp];
          	i = GPU_data_int[4 * idx + 1];
          	test = GPU_data_double[3 * idx];
          	Aplus = GPU_data_double[3 * idx + 1];
          	vLleft = b[i + m] * test;
          	dleft = b[i] * test;
          
			P[0] += Aplus * (vLleft - 1.0);
			B[0] += - Aplus * dleft;
		  }
        }
        
        __global__ void resetPB_type0(double *P, double *B, double *v, int *indices,int BPcount)
        {
        	int idx = indices[threadIdx.x] + blockIdx.x * blockDim.x;
        	int idy = threadIdx.x + blockIdx.y * blockDim.y;
        	
        	P[idx + idy * BPcount] = 0.0;
        	P[idx + idx * BPcount] = 1.0;
        	B[idx] = v[idx];
        }
        """)

        self.updateAB_gtot = mod.get_function("updateAB_gtot")
        self.updateAB_gtot.prepare(["P", "P", 'P'], block=(1, 1, 1))

        self.updateBD = mod.get_function("updateBD")
        self.updateBD.prepare(["P", "P", 'd', 'P', 'P'], block=(1, 1, 1))

        self.finalizeFun = mod.get_function("finalizeFun")
        self.finalizeFun.prepare(['P', 'P', 'P', 'P', 'P', 'i'],
                                 block=(1, 1, 1))

        self.finalizeFunBis = mod.get_function("finalizeFunBis")
        self.finalizeFunBis.prepare(['P', 'P', 'P'], block=(1, 1, 1))

        self.initPB = mod.get_function("initPB")
        self.initPB.prepare(['P', "P", 'i'], block=(1, 1, 1))

        self.fillPB = mod.get_function("fillPB")
        self.fillPB.prepare(
            ["P", "P", 'P', "P", 'P', 'P', 'P', 'P', 'P', 'i', 'd', 'i'],
            block=(1, 1, 1))

        self.fillPB_bis = mod.get_function("fillPB_bis")
        self.fillPB_bis.prepare(["P", "P", 'P', "P", 'P', 'i', 'P', 'i'],
                                block=(1, 1, 1))

        self.badFillPB_0 = mod.get_function("badFillPB_0")
        self.badFillPB_0.prepare(
            ["P", "P", 'P', "P", 'P', 'P', 'P', 'P', 'P', "P", 'i', 'i', 'd'],
            block=(1, 1, 1))

        self.resetPB_type0 = mod.get_function("resetPB_type0")
        self.resetPB_type0.prepare(['P', "P", 'P', 'P', 'i'], block=(1, 1, 1))

        dtype = numpy.dtype(numpy.float64)
        int_type = numpy.dtype(numpy.int32)

        self.P_gpu = cuda.mem_alloc(self.P.size * dtype.itemsize)
        self.B_gpu = cuda.mem_alloc(self.B.size * dtype.itemsize)

        GPU_data_int = zeros((self.neuron.BPcount, 4))
        GPU_data_double = zeros((self.neuron.BPcount, 3))
        GPU_data_int[:, 0] = self.index_ante_list[:]
        GPU_data_int[:, 1] = self.i_list[:]
        GPU_data_int[:, 2] = self.j_list[:]
        GPU_data_int[:, 3] = self.bp_list[:]
        GPU_data_double[:, 0] = self.test_list[:]
        GPU_data_double[:, 1] = self.Aplus_i[:]
        GPU_data_double[:, 2] = self.Aminus_bp[:]
        self.GPU_data_int = cuda.mem_alloc(4 * self.neuron.BPcount *
                                           int_type.itemsize)
        self.GPU_data_double = cuda.mem_alloc(3 * self.neuron.BPcount *
                                              dtype.itemsize)
        cuda.memcpy_htod(self.GPU_data_int, GPU_data_int.astype(numpy.int32))
        cuda.memcpy_htod(self.GPU_data_double,
                         GPU_data_double.astype(numpy.float64))

        self.ind0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
        cuda.memcpy_htod(self.ind0_gpu, numpy.array(self.ind0, numpy.int32))

        self.ind1_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
        cuda.memcpy_htod(self.ind1_gpu, numpy.array(self.ind1, numpy.int32))

        self.ind2_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
        cuda.memcpy_htod(self.ind2_gpu, numpy.array(self.ind2, numpy.int32))

        self.ind_bctype_0_gpu = cuda.mem_alloc(self.neuron.BPcount *
                                               int_type.itemsize)
        cuda.memcpy_htod(self.ind_bctype_0_gpu,
                         numpy.array(self.ind_bctype_0, numpy.int32))

        self.ab1_base_gpu = cuda.mem_alloc(n * dtype.itemsize)

        self.ab1_gpu = cuda.mem_alloc(3 * n * dtype.itemsize)
        self.ab1_gpu_ptr = int(self.ab1_gpu)

        self.Cm_gpu = cuda.mem_alloc(n * dtype.itemsize)
        cuda.memcpy_htod(self.Cm_gpu, self.neuron.Cm.astype(numpy.float64))

        self.gtot_gpu = cuda.mem_alloc(n * dtype.itemsize)

        self.I0_gpu = cuda.mem_alloc(n * dtype.itemsize)

        self.v_gpu = cuda.mem_alloc(n * dtype.itemsize)
        cuda.memcpy_htod(self.v_gpu, self.neuron.v)

        ab0 = zeros(3 * n).astype(numpy.float64)
        ab0[:n] = self.ab0[:]
        ab0[n:2 * n] = self.ab0[:]
        ab0[2 * n:3 * n] = self.ab0[:]

        ab2 = zeros(3 * n).astype(numpy.float64)
        ab2[:n] = self.ab2[:]
        ab2[n:2 * n] = self.ab2[:]
        ab2[2 * n:3 * n] = self.ab2[:]

        dtype = numpy.dtype(numpy.float64)

        self.ab0_gpu = cuda.mem_alloc(ab0.size * dtype.itemsize)
        self.ab0_gpu_ptr = int(self.ab0_gpu)
        self.ab2_gpu = cuda.mem_alloc(ab2.size * dtype.itemsize)
        self.ab2_gpu_ptr = int(self.ab2_gpu)

        self.bL_gpu = cuda.mem_alloc(self.bL.size * dtype.itemsize)
        self.bL_gpu_ptr = int(self.bL_gpu)
        self.bR_gpu = cuda.mem_alloc(self.bR.size * dtype.itemsize)
        self.bR_gpu_ptr = int(self.bR_gpu)

        self.b_gpu = cuda.mem_alloc(3 * self.bR.size * dtype.itemsize)
        self.b_gpu_ptr = int(self.b_gpu)  # bd + bL + bR -> vd + vL + vR

        cuda.memcpy_htod(self.ab0_gpu, ab0)
        cuda.memcpy_htod(self.ab2_gpu, ab2)
        cuda.memcpy_htod(self.bL_gpu, self.bL)
        cuda.memcpy_htod(self.bR_gpu, self.bR)

        self.ante_gpu = cuda.mem_alloc(self.ante_arr.size *
                                       self.ante_arr.dtype.itemsize)
        self.post_gpu = cuda.mem_alloc(self.ante_arr.size *
                                       self.ante_arr.dtype.itemsize)
        self.v_old_gpu = cuda.mem_alloc(self.neuron.v.size * dtype.itemsize)

        cuda.memcpy_htod(self.ante_gpu, self.ante_arr)
        cuda.memcpy_htod(self.post_gpu, self.post_arr)

        #----------------------------------------------------------------------------------

        self.v_branchpoints = zeros(self.neuron.BPcount)
        self.v_bp_gpu = cuda.mem_alloc(self.v_branchpoints.size *
                                       dtype.itemsize)

        self.timeDevice = [0]
        self.timeDeviceU = [0]
        self.timeDeviceT = [0]
        self.timeHost = [0]
        self.timeUpdater = [0]
        self.timeSolveHost = [0]
        self.timeFillHost = [0]
        self.timeFin = [0]
Пример #35
0
 def alloc_exchange_boundaries(s):
     s.ey_tmp = cuda.pagelocked_zeros((s.ny, s.nz), 'f')
     s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
     s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
     s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
Пример #36
0
	def run_simulation(self, weights, lengths, params_matrix, speeds, logger, args, n_nodes, n_work_items, n_params, nstep, n_inner_steps,
		buf_len, states, dt, min_speed):

		# setup data#{{{
		data = { 'weights': weights, 'lengths': lengths, 'params': params_matrix.T }
		base_shape = n_work_items,
		for name, shape in dict(
			tavg=(n_nodes,),
			state=(buf_len, states * n_nodes),
			).items():
			data[name] = np.zeros(shape + base_shape, 'f')

		gpu_data = self.make_gpu_data(data)#{{{
		# logger.info('history shape %r', data['state'].shape)
		logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))#}}}

		# setup CUDA stuff#{{{
		step_fn = self.make_kernel(
			source_file=args.filename,
			warp_size=32,
			block_dim_x=args.n_coupling,
			# ext_options=preproccesor_defines,
			# caching=args.caching,
			args=args,
			lineinfo=args.lineinfo,
			nh=buf_len,
			# model=args.model,
			)#}}}

		# setup simulation#{{{
		tic = time.time()
		# logger.info('nstep %i', nstep)
		streams = [drv.Stream() for i in range(32)]
		events = [drv.Event() for i in range(32)]
		tavg_unpinned = []
		tavg = drv.pagelocked_zeros(data['tavg'].shape, dtype=np.float32)
		# logger.info('data[tavg].shape %s', data['tavg'].shape)
		#}}}

		gridx = args.n_coupling // args.blockszx
		gridy = args.n_speed // args.blockszy
		final_block_dim = args.blockszx, args.blockszy, 1
		final_grid_dim = gridx, gridy

		# logger.info('final block dim %r', final_block_dim)
		logger.info('final grid dim %r', final_grid_dim)
		# assert n_coupling_per_block * n_coupling_blocks == args.n_coupling #}}}

		# logger.info('gpu_data[lengts] %s', gpu_data['lengths'].shape)
		# logger.info('nnodes %r', n_nodes)
		# logger.info('gpu_data[lengths] %r', gpu_data['lengths'])

		# run simulation#{{{
		# logger.info('submitting work')
		import tqdm
		for i in tqdm.trange(nstep):

			# event = events[i % 32]
			# stream = streams[i % 32]

			# stream.wait_for_event(events[(i - 1) % 32])

			step_fn(np.uintc(i * n_inner_steps), np.uintc(n_nodes), np.uintc(buf_len), np.uintc(n_inner_steps),
					np.uintc(n_params), np.float32(dt), np.float32(min_speed),
					gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'],
					gpu_data['tavg'],
					block=final_block_dim,
					grid=final_grid_dim)

			# event.record(streams[i % 32])
			tavg_unpinned.append(tavg.copy())
			drv.memcpy_dtoh(
				tavg,
				gpu_data['tavg'].ptr)

		# logger.info('kernel finish..')
		# release pinned memory
		tavg = np.array(tavg_unpinned)
		return tavg
Пример #37
0
 def __init__(self,
              source,
              b,
              a,
              samplerate=None,
              precision='double',
              forcesync=True,
              pagelocked_mem=True,
              unroll_filterorder=None):
     # Automatically duplicate mono input to fit the desired output shape
     if b.shape[0] != source.nchannels:
         if source.nchannels != 1:
             raise ValueError(
                 'Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.'
             )
         source = RestructureFilterbank(source, b.shape[0])
     Filterbank.__init__(self, source)
     if pycuda.context is None:
         set_gpu_device(0)
     self.precision = precision
     if self.precision == 'double':
         self.precision_dtype = float64
     else:
         self.precision_dtype = float32
     self.forcesync = forcesync
     self.pagelocked_mem = pagelocked_mem
     n, m, p = b.shape
     self.filt_b = b
     self.filt_a = a
     filt_b_gpu = array(b, dtype=self.precision_dtype)
     filt_a_gpu = array(a, dtype=self.precision_dtype)
     filt_state = zeros((n, m - 1, p), dtype=self.precision_dtype)
     if pagelocked_mem:
         filt_y = drv.pagelocked_zeros((n, ), dtype=self.precision_dtype)
         self.pre_x = drv.pagelocked_zeros((n, ),
                                           dtype=self.precision_dtype)
     else:
         filt_y = zeros(n, dtype=self.precision_dtype)
         self.pre_x = zeros(n, dtype=self.precision_dtype)
     self.filt_b_gpu = gpuarray.to_gpu(filt_b_gpu.T.flatten(
     ))  # transform to Fortran order for better GPU mem
     self.filt_a_gpu = gpuarray.to_gpu(
         filt_a_gpu.T.flatten())  # access speeds
     self.filt_state = gpuarray.to_gpu(filt_state.T.flatten())
     self.unroll_filterorder = unroll_filterorder
     if unroll_filterorder is None:
         if m <= 32:
             unroll_filterorder = True
         else:
             unroll_filterorder = False
     # TODO: improve code, check memory access patterns, maybe use local memory
     code = '''
     #define x(s,i) _x[(s)*n+(i)]
     #define y(s,i) _y[(s)*n+(i)]
     #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
     #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
     #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
     __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
     {
         int j = blockIdx.x * blockDim.x + threadIdx.x;
         if(j>=n) return;
         for(int s=0; s<numsamples; s++)
         {
     '''
     for k in range(p):
         loopcode = '''
         y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
         '''
         if unroll_filterorder:
             for i in range(m - 2):
                 loopcode += re.sub(
                     '\\bi\\b', str(i), '''
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                 ''')
         else:
             loopcode += '''
             for(int i=0;i<m-2;i++)
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
             '''
         loopcode += '''
         zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
         '''
         if k < p - 1:
             loopcode += '''
             x(s,j) = y(s,j);
             '''
         loopcode = re.sub('\\bk\\b', str(k), loopcode)
         code += loopcode
     code += '''
         }
     }
     '''
     code = code.replace('SCALAR', self.precision)
     code = re.sub("\\bp\\b", str(p),
                   code)  #replace the variable by their values
     code = re.sub("\\bm\\b", str(m), code)
     code = re.sub("\\bn\\b", str(n), code)
     #print code
     self.gpu_mod = pycuda.compiler.SourceModule(code)
     self.gpu_filt_func = self.gpu_mod.get_function("filt")
     blocksize = 256
     if n < blocksize:
         blocksize = n
     if n % blocksize == 0:
         gridsize = n / blocksize
     else:
         gridsize = n / blocksize + 1
     self.block = (blocksize, 1, 1)
     self.grid = (gridsize, 1)
     self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32),
                                self.block)
     self._has_run_once = False
Пример #38
0
if rank == 0:
    print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()

if rank == 1:
    total = np.zeros(tmax)
    for key in exec_time.iterkeys():
        total[:] += exec_time[key][:]
    for key in exec_time.iterkeys():
        print key, ':\t %1.2f %%' % (exec_time[key][2:-2].sum() /
                                     total[2:-2].sum() * 100)

    print "%1.2f GFLOPS\r" % (
        (tmax - 4) * 3 * nx * ny * nz * 30 / total[2:-2].sum() * 1e-6)

g = cuda.pagelocked_zeros((nx, ny, nz), 'f')
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
    comm.Send(g, 0, 24)
else:
    lg = np.zeros((3 * nx, ny), 'f')
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 24)
    lg[nx:-nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 2, 24)
    lg[2 * nx:, :] = g[:, :, nz / 2]
    imsh.set_array(lg.T**2)
    show()  #draw()
    #savefig('./png-wave/%.5d.png' % tstep)

    stop.record()
Пример #39
0
	def prepare(self):
		'''
		From Hines 1984 paper, discrete formula is:
		A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i)
       
		A_plus: i->i+1
		A_minus: i->i-1
		
        This gives the following tridiagonal system:
        A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i)
        
        Boundaries, one simple possibility (sealed ends):
        -(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n)
        A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0)
        '''
		mid_diameter = zeros(len(self.neuron)) # mid(i) : (i-1) <-> i
		mid_diameter[1:] = .5*(self.neuron.diameter[:-1]+self.neuron.diameter[1:])
		
		self.Aplus = zeros(len(self.neuron)) # A+ i -> j = Aplus(j)
		self.Aminus = zeros(len(self.neuron)) # A- i <- j = Aminus(j)
		self.Aplus[1]= mid_diameter[1]**2/(4*self.neuron.diameter[1]*self.neuron.length[1]**2*self.neuron.Ri)
		self.Aplus[2:]=mid_diameter[2:]**2/(4*self.neuron.diameter[1:-1]*self.neuron.length[1:-1]**2*self.neuron.Ri)
		self.Aminus[1:]=mid_diameter[1:]**2/(4*self.neuron.diameter[1:]*self.neuron.length[1:]**2*self.neuron.Ri) 
		
		self.neuron.index = zeros(len(self.neuron),int) # gives the index of the branch containing the current compartment
		self.neuron.branches = [] # (i,j,bp,ante,ante_index,pointType)
		# i is the first compartment
		# bp is the last, a branch point
		# j is the end of the "inner branch". j = bp-1
		# ante is the branch point to which i is connected
		
		self.neuron.BPcount = 0 # number of branch points (or branches). = len(self.neuron.branches)
		self.neuron.long_branches_count = 0 # number of branches with len(branch) > 1
		
		#self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		
		self.bL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.bR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.ab = zeros((3,len(self.neuron)))
		self.ab0 = zeros(len(self.neuron))
		self.ab1 = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.ab2 = zeros(len(self.neuron))
		self.ab1_base = zeros(len(self.neuron))
		#self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64)
		
		self.mTrunc = 0 # used to truncate vL and vR
		self.delta_list = zeros(len(self.neuron)) #used to find mTrunc
		
		# prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus
		self.prepare_branch(self.neuron.morphology, mid_diameter,0)
		
		# linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account.
		self.P = zeros((self.neuron.BPcount,self.neuron.BPcount))
		self.B = zeros(self.neuron.BPcount)
		self.solution_bp = zeros(self.neuron.BPcount)
		
		self.gtot = zeros(len(self.neuron))
		self.I0 = zeros(len(self.neuron))
		self.i_list = []
		self.j_list = []
		self.i_list_bis = []
		self.j_list_bis = []
		new_tridiag = True
		self.bp_list = []
		self.pointType_list = []
		self.pointTypeAnte_list = []
		self.index_ante_list0 = []
		self.index_ante_list1 = []
		self.index_ante_list2 = []
		self.ante_list = []
		self.post_list = []
		self.ante_list_idx = []
		self.post_list_idx = []
		self.id = []
		self.test_list = []
		temp = zeros(self.neuron.BPcount)
		self.ind0 = []
		self.ind_bctype_0 = []
		for index,(i,j,bp,ante,index_ante,pointType) in enumerate(self.neuron.branches) :
			self.i_list.append(i)
			self.j_list.append(j)
			if new_tridiag:
				self.i_list_bis.append(i)
				ii = i
			else:
				ii = self.i_list[-1]
			if j-ii+1>2:
				self.j_list_bis.append(j)
				new_tridiag = True
			else :
				new_tridiag = False
			self.bp_list.append(bp)
			self.pointType_list.append(max(1,pointType))
			self.pointTypeAnte_list.append(max(1,self.neuron.bc[ante]))
			temp[index] = index_ante
			self.id.append(index)
			if (j-i+2>1):
				self.test_list.append(1)
			else :
				self.test_list.append(0)
			for x in xrange(j-i+2):
				self.ante_list.append(ante)
				self.post_list.append(bp)
				self.ante_list_idx.append(index_ante)
				self.post_list_idx.append(index)
			if index_ante == 0 and index != 0:
				self.ind0.append(index)
			if pointType==0 :
				self.ind_bctype_0.append(bp)
		
		self.ante_arr = numpy.array(self.ante_list_idx)
		self.post_arr = numpy.array(self.post_list_idx)
		
		self.index_ante_list1, self.ind1 = numpy.unique(temp,return_index=True)
		self.ind1 = numpy.sort(self.ind1)
		self.index_ante_list1 = temp[self.ind1]
		self.index_ante_list1 = list(self.index_ante_list1)
		self.ind2 = []
		for x in xrange(self.neuron.BPcount):
			self.ind2.append(x)
		self.ind2 = numpy.delete(self.ind2,self.ind1,None) 
		self.ind2 = numpy.setdiff1d(self.ind2, self.ind0, assume_unique=True)
		self.index_ante_list2 = temp[self.ind2]
		self.index_ante_list2 = list(self.index_ante_list2)
		
		self.index_ante_list = list(temp)
		self.Aminus_bp = self.Aminus[self.bp_list]
		self.Aminus_bp [:] *= self.pointType_list[:]
		self.Aplus_i = self.Aplus[self.i_list]
		self.Aplus_i[:] *= self.pointTypeAnte_list[:]
		#--------------------------------------------------------GPU------------------------
		n = len(self.neuron)
		
		mod = SourceModule("""
        __global__ void updateAB_gtot(double *ab1, double *ab1_base, double *gtot)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          
          ab1[idx] = ab1_base[idx] - gtot[idx];
          ab1[idx+gridDim.x] = ab1_base[idx] - gtot[idx];
          ab1[idx+2*gridDim.x] = ab1_base[idx] - gtot[idx];
        }
        
        __global__ void updateBD(double *bd, double *Cm, double dt,double *v, double *I0)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          
          bd[idx] = - Cm[idx] / dt * v[idx] - I0[idx];
        }
        
        __global__ void finalizeFun(double *v, double *v_bp, int *ante,int *post, double *b, int m)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int idx_a = ante[idx];
          int idx_p = post[idx];
          
          v[idx] = b[idx + m] * v_bp[idx_a] + b[idx + 2*m] * v_bp[idx_p] + b[idx]; // vL, vR, d
        }
        
        __global__ void finalizeFunBis(double *v, double *v_bp, int *GPU_data_int)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int bp = GPU_data_int[4 * idx + 3]; 
          v[bp] = v_bp[idx];
        }
        
        __global__ void initPB(double *P, double *B, int BPcount)
        {
        	int idx = threadIdx.x + blockIdx.x * blockDim.x;
        	int idy = threadIdx.x + blockIdx.y * blockDim.y;
        	
        	P[idx + idy * BPcount] = 0.0;
        	B[idx] = 0.0;
        }
        
        //GPU_data_int is : ante_list, i_list, j_list, bp_list
        //GPU_data_double is : test_list, Aplus, Aminus
        __global__ void fillPB(double *P, double *B, double *b, double *Cm_l, double *gtot_l, int *GPU_data_int,
                        double *GPU_data_double, double *I0_l, double *v_l, int BPcount, double dt, int m)
        { 
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          int idx_ante = GPU_data_int[4 * idx];
          int i = GPU_data_int[4 * idx + 1];
          int j = GPU_data_int[4 * idx + 2];
          int bp = GPU_data_int[4 * idx + 3];
          double test = GPU_data_double[3 * idx];
          double Aplus = GPU_data_double[3 * idx + 1];
          double Aminus = GPU_data_double[3 * idx + 2];
          double Cm = Cm_l[bp];
          double gtot = gtot_l[bp];
          double I0 = I0_l[bp];
          double v_bp = v_l[bp];
          double vLright = b[j + m] * test;
          double vRleft = b[i + 2*m] * test;
          double vRright = b[j + 2*m] * test;
          double dright = b[j] * test;
          
          B[idx] += -Cm/dt * v_bp - I0 -Aminus * dright;
          P[idx * BPcount + idx] += -Cm/dt - gtot + Aminus * (vRright - 1.0);
          P[idx * BPcount + idx_ante] += Aminus * vLright;
          P[idx_ante * BPcount + idx] += Aplus * vRleft;
        }
        
        __global__ void fillPB_bis(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double,
                        int BPcount, int *indices, int m)
        { 
          int idx_temp = threadIdx.x + blockIdx.x * blockDim.x;
          int idx = indices[idx_temp];
          int idx_ante = GPU_data_int[4 * idx];
          int i = GPU_data_int[4 * idx + 1];
          int test = GPU_data_double[3 * idx];
          double Aplus = GPU_data_double[3 * idx + 1];
          double vLleft = b[i + m] * test;
          double dleft = b[i] * test;
          
          B[idx_ante] += - Aplus * dleft;
          P[idx_ante * (BPcount + 1)] += Aplus * (vLleft - 1.0);
        }
        
        __global__ void badFillPB_0(double *P, double *B, double *b, int *GPU_data_int, double *GPU_data_double,
                        int *indices, double *Cm_l, double *gtot_l, double *I0_l, double *v_l, int len_indices, int m, double dt)
        { 
          double Cm = Cm_l[0];
          double gtot = gtot_l[0];
          double I0 = I0_l[0];
          double v_0 = v_l[0];
          
          B[0] = - Cm/dt * v_0 - I0;
		  P[0] = - Cm/dt - gtot;
		  
		  int idx;
		  int i;
		  int test;
		  double Aplus;
		  double vLleft;
		  double dleft;
          for (int idx_temp=0;idx_temp<len_indices;idx_temp++)
          {
          	idx = indices[idx_temp];
          	i = GPU_data_int[4 * idx + 1];
          	test = GPU_data_double[3 * idx];
          	Aplus = GPU_data_double[3 * idx + 1];
          	vLleft = b[i + m] * test;
          	dleft = b[i] * test;
          
			P[0] += Aplus * (vLleft - 1.0);
			B[0] += - Aplus * dleft;
		  }
        }
        
        __global__ void resetPB_type0(double *P, double *B, double *v, int *indices,int BPcount)
        {
        	int idx = indices[threadIdx.x] + blockIdx.x * blockDim.x;
        	int idy = threadIdx.x + blockIdx.y * blockDim.y;
        	
        	P[idx + idy * BPcount] = 0.0;
        	P[idx + idx * BPcount] = 1.0;
        	B[idx] = v[idx];
        }
        """)
		
		
		self.updateAB_gtot = mod.get_function("updateAB_gtot")
		self.updateAB_gtot.prepare(["P","P",'P'],block=(1,1,1))
		
		self.updateBD = mod.get_function("updateBD")
		self.updateBD.prepare(["P","P",'d','P','P'],block=(1,1,1))
		
		self.finalizeFun = mod.get_function("finalizeFun")
		self.finalizeFun.prepare(['P','P','P','P','P','i'],block=(1,1,1))
		
		self.finalizeFunBis = mod.get_function("finalizeFunBis")
		self.finalizeFunBis.prepare(['P','P','P'],block=(1,1,1))
		
		self.initPB = mod.get_function("initPB")
		self.initPB.prepare(['P',"P",'i'],block=(1,1,1))
		
		self.fillPB = mod.get_function("fillPB")
		self.fillPB.prepare(["P","P",'P',"P",'P','P','P','P','P','i','d','i'],block=(1,1,1))
		
		self.fillPB_bis = mod.get_function("fillPB_bis")
		self.fillPB_bis.prepare(["P","P",'P',"P",'P','i','P','i'],block=(1,1,1))
		
		self.badFillPB_0 = mod.get_function("badFillPB_0")
		self.badFillPB_0.prepare(["P","P",'P',"P",'P','P','P','P','P',"P",'i','i','d'],block=(1,1,1))
		
		self.resetPB_type0 = mod.get_function("resetPB_type0")
		self.resetPB_type0.prepare(['P',"P",'P','P','i'],block=(1,1,1))
		
		dtype = numpy.dtype(numpy.float64)
		int_type = numpy.dtype(numpy.int32)
		
		self.P_gpu = cuda.mem_alloc(self.P.size * dtype.itemsize)
		self.B_gpu = cuda.mem_alloc(self.B.size * dtype.itemsize)
		
		GPU_data_int = zeros((self.neuron.BPcount,4))
		GPU_data_double = zeros((self.neuron.BPcount,3))
		GPU_data_int[:,0] = self.index_ante_list[:]
		GPU_data_int[:,1] = self.i_list[:]
		GPU_data_int[:,2] = self.j_list[:]
		GPU_data_int[:,3] = self.bp_list[:]
		GPU_data_double[:,0] = self.test_list[:]
		GPU_data_double[:,1] = self.Aplus_i[:]
		GPU_data_double[:,2] = self.Aminus_bp[:]
		self.GPU_data_int = cuda.mem_alloc(4 * self.neuron.BPcount * int_type.itemsize)
		self.GPU_data_double = cuda.mem_alloc(3 * self.neuron.BPcount * dtype.itemsize)
		cuda.memcpy_htod(self.GPU_data_int,GPU_data_int.astype(numpy.int32))
		cuda.memcpy_htod(self.GPU_data_double,GPU_data_double.astype(numpy.float64))
		
		self.ind0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
		cuda.memcpy_htod(self.ind0_gpu,numpy.array(self.ind0,numpy.int32))
		
		self.ind1_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
		cuda.memcpy_htod(self.ind1_gpu,numpy.array(self.ind1,numpy.int32))
		
		self.ind2_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
		cuda.memcpy_htod(self.ind2_gpu,numpy.array(self.ind2,numpy.int32))
		
		self.ind_bctype_0_gpu = cuda.mem_alloc(self.neuron.BPcount * int_type.itemsize)
		cuda.memcpy_htod(self.ind_bctype_0_gpu,numpy.array(self.ind_bctype_0,numpy.int32))
		
		self.ab1_base_gpu =  cuda.mem_alloc(n * dtype.itemsize)
		
		self.ab1_gpu =  cuda.mem_alloc(3 * n * dtype.itemsize)
		self.ab1_gpu_ptr = int(self.ab1_gpu)
		
		self.Cm_gpu =  cuda.mem_alloc(n * dtype.itemsize)
		cuda.memcpy_htod(self.Cm_gpu,self.neuron.Cm.astype(numpy.float64))
		
		self.gtot_gpu =  cuda.mem_alloc(n * dtype.itemsize)
		
		self.I0_gpu = cuda.mem_alloc(n * dtype.itemsize)
		
		self.v_gpu = cuda.mem_alloc(n * dtype.itemsize)
		cuda.memcpy_htod(self.v_gpu,self.neuron.v)
		
		ab0 = zeros(3*n).astype(numpy.float64)
		ab0[:n] = self.ab0[:]
		ab0[n:2*n] = self.ab0[:]
		ab0[2*n:3*n] = self.ab0[:]
		
		ab2 = zeros(3*n).astype(numpy.float64)
		ab2[:n] = self.ab2[:]
		ab2[n:2*n] = self.ab2[:]
		ab2[2*n:3*n] = self.ab2[:]
		
		dtype = numpy.dtype(numpy.float64)
		
		self.ab0_gpu =  cuda.mem_alloc(ab0.size * dtype.itemsize)
		self.ab0_gpu_ptr = int(self.ab0_gpu)
		self.ab2_gpu =  cuda.mem_alloc(ab2.size * dtype.itemsize)
		self.ab2_gpu_ptr = int(self.ab2_gpu)
		
		self.bL_gpu =  cuda.mem_alloc(self.bL.size * dtype.itemsize)
		self.bL_gpu_ptr = int(self.bL_gpu)
		self.bR_gpu =  cuda.mem_alloc(self.bR.size * dtype.itemsize)
		self.bR_gpu_ptr = int(self.bR_gpu)
		
		self.b_gpu =  cuda.mem_alloc(3 * self.bR.size * dtype.itemsize)
		self.b_gpu_ptr = int(self.b_gpu) # bd + bL + bR -> vd + vL + vR
		
		cuda.memcpy_htod(self.ab0_gpu, ab0)
		cuda.memcpy_htod(self.ab2_gpu, ab2)
		cuda.memcpy_htod(self.bL_gpu, self.bL)
		cuda.memcpy_htod(self.bR_gpu, self.bR)
		
		self.ante_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize)
		self.post_gpu = cuda.mem_alloc(self.ante_arr.size * self.ante_arr.dtype.itemsize)
		self.v_old_gpu = cuda.mem_alloc(self.neuron.v.size * dtype.itemsize)
		
		cuda.memcpy_htod(self.ante_gpu,self.ante_arr)
		cuda.memcpy_htod(self.post_gpu,self.post_arr)
		
		
		#----------------------------------------------------------------------------------
		
		self.v_branchpoints = zeros(self.neuron.BPcount)
		self.v_bp_gpu = cuda.mem_alloc(self.v_branchpoints.size * dtype.itemsize)
		
		self.timeDevice = [0]
		self.timeDeviceU = [0]
		self.timeDeviceT = [0]
		self.timeHost = [0]
		self.timeUpdater = [0]
		self.timeSolveHost = [0]
		self.timeFillHost = [0]
		self.timeFin = [0]
        hx_gpu = cuda.to_device(f)
        hy_gpu = cuda.to_device(f)
        hz_gpu = cuda.to_device(f)

        cex_gpu = cuda.to_device(set_c(f, (None, -1, -1)))
        cey_gpu = cuda.to_device(set_c(f, (-1, None, -1)))
        cez_gpu = cuda.to_device(set_c(f, (-1, -1, None)))
        chx_gpu = cuda.to_device(set_c(f, (None, 0, 0)))
        chy_gpu = cuda.to_device(set_c(f, (0, None, 0)))
        chz_gpu = cuda.to_device(set_c(f, (0, 0, None)))

        # pinned memory allocation for zero-copy
        if myrank != 1:
            ex_send = cuda.pagelocked_zeros(
                (nx, ny),
                np.float32,
                order='F',
                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
            ey_send = cuda.pagelocked_zeros(
                (nx, ny),
                np.float32,
                order='F',
                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
            hx_recv = cuda.pagelocked_zeros(
                (nx, ny),
                np.float32,
                order='F',
                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
            hy_recv = cuda.pagelocked_zeros(
                (nx, ny),
                np.float32,
Пример #41
0
 def alloc_async_host_buf(self, shape, dtype):
     """Allocates a buffer that can be used for asynchronous data
     transfers."""
     return cuda.pagelocked_zeros(shape, dtype=dtype)
Пример #42
0
from wavemoth.cuda.profile import cuda_profile

N = 4092

nside = 2048
npix = 12 * 2048**2
nrings = 4 * nside - 1
lmax = 2 * nside

#x = np.asarray(np.random.rand(N), np.float32)
#xf = np.fft.fft(x)
#x_gpu = gpuarray.to_gpu(x)
#xf_gpu = gpuarray.empty(N/2+1, np.complex64)

map = drv.pagelocked_zeros(npix, np.float64)
buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128)

map_gpu = drv.mem_alloc(npix * 8)
buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16)

drv.memcpy_htod(map_gpu, map)

from wavemoth.cuda import cufft

print 'ctoring plan'
plan = cufft.HealpixCuFFTPlan(2048, 8)

repeats = 1
print 'plan ctored'
with cuda_profile() as prof:
Пример #43
0
 def __init__(self, source, b, a, samplerate=None,
              precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
     # Automatically duplicate mono input to fit the desired output shape
     if b.shape[0]!=source.nchannels:
         if source.nchannels!=1:
             raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
         source = RestructureFilterbank(source, b.shape[0])
     Filterbank.__init__(self, source)
     if pycuda.context is None:
         set_gpu_device(0)
     self.precision=precision
     if self.precision=='double':
         self.precision_dtype=float64
     else:
         self.precision_dtype=float32
     self.forcesync=forcesync
     self.pagelocked_mem=pagelocked_mem
     n, m, p=b.shape
     self.filt_b=b
     self.filt_a=a
     filt_b_gpu=array(b, dtype=self.precision_dtype)
     filt_a_gpu=array(a, dtype=self.precision_dtype)
     filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
     if pagelocked_mem:
         filt_y=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
         self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
     else:
         filt_y=zeros(n, dtype=self.precision_dtype)
         self.pre_x=zeros(n, dtype=self.precision_dtype)
     self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
     self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
     self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
     self.unroll_filterorder = unroll_filterorder
     if unroll_filterorder is None:
         if m<=32:
             unroll_filterorder = True
         else:
             unroll_filterorder = False
     # TODO: improve code, check memory access patterns, maybe use local memory
     code='''
     #define x(s,i) _x[(s)*n+(i)]
     #define y(s,i) _y[(s)*n+(i)]
     #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
     #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
     #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
     __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
     {
         int j = blockIdx.x * blockDim.x + threadIdx.x;
         if(j>=n) return;
         for(int s=0; s<numsamples; s++)
         {
     '''
     for k in range(p):
         loopcode='''
         y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
         '''
         if unroll_filterorder:
             for i in range(m-2):
                 loopcode+=re.sub('\\bi\\b', str(i), '''
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                 ''')
         else:
             loopcode+='''
             for(int i=0;i<m-2;i++)
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
             '''
         loopcode+='''
         zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
         '''
         if k<p-1:
             loopcode+='''
             x(s,j) = y(s,j);
             '''
         loopcode=re.sub('\\bk\\b', str(k), loopcode)
         code+=loopcode
     code+='''
         }
     }
     '''
     code=code.replace('SCALAR', self.precision)
     code=re.sub("\\bp\\b", str(p), code) #replace the variable by their values
     code=re.sub("\\bm\\b", str(m), code)
     code=re.sub("\\bn\\b", str(n), code)
     #print code
     self.gpu_mod=pycuda.compiler.SourceModule(code)
     self.gpu_filt_func=self.gpu_mod.get_function("filt")
     blocksize=256
     if n<blocksize:
         blocksize=n
     if n%blocksize==0:
         gridsize=n/blocksize
     else:
         gridsize=n/blocksize+1
     self.block=(blocksize, 1, 1)
     self.grid=(gridsize, 1)
     self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
     self._has_run_once=False
Пример #44
0
        sys.stdout.flush()
        start.record()

if rank == 0:
    print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()

if rank == 1:
    total = np.zeros(tmax)
    for key in exec_time.iterkeys():
        total[:] += exec_time[key][:]
    for key in exec_time.iterkeys():
        print key, ":\t %1.2f %%" % (exec_time[key][2:-2].sum() / total[2:-2].sum() * 100)

    print "%1.2f GFLOPS\r" % ((tmax - 4) * 3 * nx * ny * nz * 30 / total[2:-2].sum() * 1e-6)

g = cuda.pagelocked_zeros((nx, ny, nz), "f")
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
    comm.Send(g, 0, 24)
else:
    lg = np.zeros((3 * nx, ny), "f")
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 24)
    lg[nx:-nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 2, 24)
    lg[2 * nx :, :] = g[:, :, nz / 2]
    imsh.set_array(lg.T ** 2)
    show()  # draw()
    # savefig('./png-wave/%.5d.png' % tstep)

    stop.record()
Пример #45
0
__global__ void increment(int* a,float* progress)
{	
	for(int i=0;i<500;i++){
		atomicAdd(progress,1.0f);
	}
}

""").get_function("increment")
print("Compiled and got function increment")

def In(thing):
	thing_pointer = cuda.mem_alloc(thing.nbytes)
	cuda.memcpy_htod(thing_pointer, thing)
	return thing_pointer

pagelocked_mem = cuda.pagelocked_zeros((1,1),numpy.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
pagelocked_mem_ptr = numpy.intp(pagelocked_mem.base.get_device_pointer())
print(pagelocked_mem[0,0])

a = numpy.int32(345)
a_gpu = In(a)


increment(a_gpu,pagelocked_mem_ptr, block=(1,1,1), grid=(50,50,1))

while pagelocked_mem[0,0]<(50*50*500):
	print pagelocked_mem[0,0]

cuda.Context.synchronize()

print pagelocked_mem[0,0]
Пример #46
0
    def getRT(self,
              s_map,
              srt_gpu,
              srt_nsamp,
              srt_npairs,
              npairs,
              store_rt=False):
        """
        Computes the rank template

        s_map(Sample Map) -  an list of 1s and 0s of length nsamples where 1 means use this sample
            to compute rank template
        srt_gpu - cuda memory object containing srt(sample rank template) array on gpu
        srt_nsamp, srt_npairs - shape(buffered) of srt_gpu object
        npairs - true number of gene pairs being compared
        b_size - size of the blocks for computation
        store_rt - determines the RETURN value
            False(default) = returns an numpy array shape(npairs) of the rank template
            True = returns the rt_gpu object and the padded size of the rt_gpu objet (rt_obj, npairs_padded)
        """

        b_size = self.b_size
        s_map_buff = self.s_map_buff = cuda.pagelocked_zeros(
            (int(srt_nsamp), ),
            np.int32,
            mem_flags=cuda.host_alloc_flags.DEVICEMAP)

        s_map_buff[:len(s_map)] = np.array(s_map, dtype=np.int32)

        s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())
        #cuda.memcpy_htod(s_map_gpu, s_map_buff)

        #sample blocks
        g_y_sz = self.getGrid(srt_nsamp)
        #pair blocks
        g_x_sz = self.getGrid(srt_npairs)

        block_rt_gpu = cuda.mem_alloc(
            int(g_y_sz * srt_npairs * (np.uint32(1).nbytes)))

        grid = (g_x_sz, g_y_sz)

        func1, func2 = self.getrtKern(g_y_sz)

        shared_size = b_size * b_size * np.uint32(1).nbytes

        func1(srt_gpu,
              np.uint32(srt_nsamp),
              np.uint32(srt_npairs),
              s_map_gpu,
              block_rt_gpu,
              np.uint32(g_y_sz),
              block=(b_size, b_size, 1),
              grid=grid,
              shared=shared_size)

        rt_buffer = self.rt_buffer = cuda.pagelocked_zeros(
            (int(srt_npairs), ),
            np.int32,
            mem_flags=cuda.host_alloc_flags.DEVICEMAP)
        rt_gpu = np.intp(rt_buffer.base.get_device_pointer())

        func2(block_rt_gpu,
              rt_gpu,
              np.int32(s_map_buff.sum()),
              block=(b_size, 1, 1),
              grid=(g_x_sz, ))

        if store_rt:
            #this is in case we want to run further stuff without
            #transferring back and forth
            return (rt_gpu, srt_npairs)
        else:
            #rt_buffer = np.zeros((srt_npairs ,), dtype=np.int32)
            #cuda.memcpy_dtoh(rt_buffer, rt_gpu)
            #rt_gpu.free()
            return rt_buffer[:npairs]
	def alloc_exchange_boundaries(s):
		s.ey_tmp = cuda.pagelocked_zeros((s.ny,s.nz),'f')
		s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
		s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
		s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
Пример #48
0
    include_dirs=[
        "/usr/local/cuda/include",
    ],  # This because we use mma.h which isn't included in pycuda's default include dir.
    no_extern_c=True)  #An explanation of this follows.
# PyCuda normally wraps the entirety of its included source in an extern "C" {...} block. This is to prevent the kernels from being compiled in a C++
# fashion, and allows the module.get_function() [see below] to actually find the function by its name (i.e. identifier).
# I need to use this option because if #include <mma.h> is inside the extern "C" block, then PyCuda kicks puppies and clubs baby seals.
# You need to manually put this block around your kernel for the functionality to work.
# NOTE: I am not sure whether this prevents us from using any advanced, C++-like features (such as classes, inheritance, polymorphism, etc) in kernels.
# Probably we won't need those things anyway.
simple_tc_matmul_kernel = module.get_function(
    "simple_tc_matmul"
)  # This looks for the "simple_tc_matmul" identifier in the output of nvcc.

# Set up the A matrix. Currently, it is mostly zeros.
a_host = cuda.pagelocked_zeros((tcm_size, tcm_size), dtype=np.float16)
# Manually insert some test data.
a_host[0, 0] = np.float16(num1)
#a_host[0,2] = np.float16(num2)
a_device = cuda.mem_alloc(a_host.nbytes)
cuda.memcpy_htod(a_device, a_host)

print("A:")
print_mat(a_host)

# B will be mostly zeros.
b_host = cuda.pagelocked_zeros((tcm_size, tcm_size), dtype=np.float16)
# Manually insert some test data.
b_host[0, 0] = np.float16(num2)
#b_host[2,0] = np.float16(num1)
b_device = cuda.mem_alloc(a_host.nbytes)
Пример #49
0
    BGRA2NV12 = get_BGRA2NV12()
    print("BGRA2NV12=%s" % BGRA2NV12)

    w = roundup(512, 32)
    h = roundup(512, 32)

    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)

    outputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)

    #populate host buffer with random data:
    buf = inputBuffer.data
    for y in range(h*3/2):
        dst = y * inputPitch
        #debug("%s: %s:%s (size=%s) <- %s:%s (size=%s)", y, dst, dst+w, len(buffer), src, src+w, len(Yplane))
        for x in range(w):
            buf[dst+x] = numpy.byte((x+y) % 256)

    #copy input buffer to CUDA buffer:
    driver.memcpy_htod(cudaInputBuffer, inputBuffer)
import pycuda.autoinit

# Setup the kernel
mod = SourceModule("""
__global__ void add(float *a, float *b, float *c, float *c_map) {
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	float val;

	val = a[idx] + b[idx];
	c[idx] = val;
	c_map[idx] = val;
}
""")
add = mod.get_function("add")

# Memory allocation
nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)
c = np.zeros_like(a)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)

# Page-locked host memory allocation for zero-copy
c_map = cuda.pagelocked_zeros(nx, np.float32)

add( a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256,1,1), grid=(4,1) )
assert( np.linalg.norm( (a+b)-c ) == 0 )
assert( np.linalg.norm( (a+b)-c_map ) == 0 )