def get_kernel_function_info(a, W1=0, W2=1, W3=1): """Show kernel information Including 1. max #threads per block, 2. active warps per MP, 3. thread block per MP, 4. usage of shared memory, 5. const memory , 6. local memory 7. registers 8. hardware occupancy 9. limitation of the hardware occupancy """ import pycuda.tools as tl import pycuda.driver as dri dev = dri.Device(0) td = tl.DeviceData() if not W1: W1 = a.max_threads_per_block to = tl.OccupancyRecord(td, W1 * W2 * W3, a.shared_size_bytes, a.num_regs) print "***************************************" print " Function Info " print " -> max threads per block: %d / %d / %d" % \ (a.max_threads_per_block, dev.max_threads_per_block, dev.max_threads_per_multiprocessor) print " -> shared mem : %d / %d" % (a.shared_size_bytes, td.shared_memory) print " -> const mem : %d" % a.const_size_bytes print " -> local mem : %d" % a.local_size_bytes print " -> register : %d / %d" % (a.num_regs, td.registers) print " -> thread block per MP %d / %d" % \ (to.tb_per_mp, td.thread_blocks_per_mp) print " -> warps per MP %d / %d" % (to.warps_per_mp, td.warps_per_mp) print " -> occupancy %f" % to.occupancy print " -> limitation %s" % to.limited_by print " Block size : %dx%dx%d" % (W1, W2, W3) print "***************************************"
def run(self, parameters, initValues, timing=True, info=False, constant_sets=False, pairings=False):
    """Run every compiled CUDA step code over the given parameter sets.

    parameters    -- 2-D array-like of parameter sets, one row per set
    initValues    -- initial species values; when constant_sets is True this
                     is a list of candidate initial-condition arrays whose
                     first rows identify them (presumably -- verify callers)
    timing        -- when True, print per-kernel and total GPU running times
    info          -- when True, print kernel/occupancy diagnostics
    constant_sets -- when True, pair each cuda file with the fixed
                     initial-condition sets listed in `pairings`
    pairings      -- dict: cuda file name -> list of initial-condition
                     arrays; only consulted when constant_sets is True

    Returns a single result array when only one cuda file is configured,
    otherwise a list with one result array per cuda file.
    """
    #NEEDS TO BE ADDED AGAIN
    ##########################################################################
    #check parameters and initValues for compability with pre-defined parameterNumber and speciesNumber
    #if(len(parameters[0]) != self._parameterNumber):
    #    print "Error: Number of parameters specified (" + str(self._parameterNumber) + ") and given in parameter array (" + str(len(parameters[0])) + ") differ from each other!"
    #    exit()
    #elif(len(initValues[0]) != self._speciesNumber):
    #    print "Error: Number of species specified (" + str(self._speciesNumber) + ") and given in species array (" + str(len(initValues[0])) + ") differ from each other!"
    #    exit()
    #elif(len(parameters) != len(initValues)):
    #    print "Error: Number of sets of parameters (" + str(len(parameters)) + ") and species (" + str(len(initValues)) + ") do not match!"
    #    exit()
    ##########################################################################

    #returnValue_final = [np.shape(parameters)[0],self._beta, self._resultNumber, self._speciesNumber]
    #returnValue_final = [np.zeros(returnValue_final) for x in self._stepCode]
    # One result slot per cuda file; placeholders are overwritten below.
    returnValue_final = [""] * len(self._cudafiles)

    if constant_sets == True:
        # Keep the caller's arrays: `parameters`/`initValues` are rebuilt
        # per cuda file inside the loop below.
        initValues_orig = initValues
        # First row of each candidate set, used to identify sets by value.
        initValues_check = [x[0, :] for x in initValues_orig]
        parameters_orig = parameters

    total_time = 0.0

    for count, cuda in enumerate(self._stepCode):
        if constant_sets == True:
            # Initial-condition sets assigned to this cuda file.
            initValues_ind = pairings[self._cudafiles[count]]
            initValues = np.zeros(
                (np.shape(parameters)[0] * len(initValues_ind), self._speciesNumber))
            for i, ICs in enumerate(initValues_ind):
                # Find the original set whose first row matches ICs in every
                # species position (exact element-wise equality count).
                index_IC = [sum(ICs == x) for x in initValues_check].index(self._speciesNumber)
                initValues[i * np.shape(parameters_orig)[0]:(i + 1) * np.shape(parameters_orig)[0], :] = initValues_orig[index_IC]
            # Tile the parameter sets once per initial-condition set.
            parameters = np.concatenate((parameters_orig, ) * len(initValues_ind), axis=0)
            #print parameters

        # NOTE(review): compiles only once -- every later cuda file reuses
        # the first compiled method; confirm this is intended.
        if (self._compiledRunMethod == None and self._runtimeCompile):
            #compile to determine blocks and threads
            self._completeCode, self._compiledRunMethod = self._compileAtRuntime(cuda, parameters)

        blocks, threads = self._getOptimalGPUParam(parameters)
        if info == True:
            print "cuda-sim: threads/blocks:", threads, blocks

        # real runtime compile

        #self._seedValue = seed
        #np.random.seed(self._seedValue)

        # make multiples of initValues: each row is repeated beta times so
        # every repeat of a parameter set gets its own initial-value row
        initNew = np.zeros((len(initValues) * self._beta, self._speciesNumber))
        for i in range(len(initValues)):
            for j in range(self._beta):
                for k in range(self._speciesNumber):
                    initNew[i * self._beta + j][k] = initValues[i][k]
        initValues = initNew

        if info == True:
            print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs

            occ = tools.OccupancyRecord(tools.DeviceData(), threads=threads, shared_mem=self._compiledRunMethod.shared_size_bytes, registers=self._compiledRunMethod.num_regs)
            print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy

        if timing:
            start = time.time()

        # number of device calls: launches are chunked so no single call
        # exceeds _MAXBLOCKSPERDEVICE blocks
        runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE)))

        for i in range(runs):
            # for last device call calculate number of remaining threads to run
            if (i == runs - 1):
                runblocks = int(blocks % self._MAXBLOCKSPERDEVICE)
                if (runblocks == 0):
                    runblocks = self._MAXBLOCKSPERDEVICE
            else:
                runblocks = int(self._MAXBLOCKSPERDEVICE)

            if info == True:
                print "cuda-sim: Run", runblocks, "blocks."

            # Thread-index window of this chunk; parameters are sliced per
            # set (beta repeats share one set, hence the / self._beta,
            # which is Python 2 integer division), initValues per thread.
            minIndex = self._MAXBLOCKSPERDEVICE * i * threads
            maxIndex = minIndex + threads * runblocks
            runParameters = parameters[minIndex / self._beta:maxIndex / self._beta]
            runInitValues = initValues[minIndex:maxIndex]

            #first run store return Value
            if (i == 0):
                returnValue = self._runSimulation(runParameters, runInitValues, runblocks, threads)
            else:
                returnValue = np.append(returnValue, self._runSimulation(runParameters, runInitValues, runblocks, threads), axis=0)

        if timing:
            print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round((time.time() - start), 4), "s"
            total_time += time.time() - start

        if info:
            print ""

        returnValue_final[count] = returnValue

    print "cuda-sim: total running time:", round((total_time), 4), "s"

    if len(returnValue_final) == 1:
        return returnValue_final[0]
    else:
        return returnValue_final
def run(self):
    """Worker entry point: run the whole simulation on one CUDA device.

    Acquires a context on card self._card (the default context when the
    card number is negative), compiles self._stepCode, launches the
    kernel in chunks of at most _MAXBLOCKSPERDEVICE blocks, and pushes
    [card, result] onto self.output_cpu (presumably a multiprocessing
    queue -- confirm against the spawning code) before releasing the
    context.

    Returns the accumulated result array.
    """
    # obtain a CUDA context
    driver.init()
    if self._card < 0:
        # Negative card number: let pycuda choose a device.
        self._context = tools.make_default_context()
    else:
        self._context = driver.Device(self._card).make_context()

    if self._info:
        print "cuda-sim: running on device ", self._card, self._context.get_device().name(), \
            self._context.get_device().pci_bus_id()

    # hack for SDE code
    self._device = 0

    # compile code
    self._completeCode, self._compiledRunMethod = self._compile(self._stepCode)

    blocks, threads = self._get_optimal_gpu_param()
    if self._info:
        print "cuda-sim: threads/blocks:", threads, blocks

    # make multiples of initValues incase beta > 1: each row is repeated
    # beta times so every repeat gets its own initial-value row
    init_new = np.zeros((len(self._initValues) * self._beta, self._speciesNumber))
    for i in range(len(self._initValues)):
        for j in range(self._beta):
            for k in range(self._speciesNumber):
                init_new[i * self._beta + j][k] = self._initValues[i][k]
    self._initValues = copy.deepcopy(init_new)

    if self._info:
        print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, \
            self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs

        occ = tools.OccupancyRecord(tools.DeviceData(), threads=threads, shared_mem=self._compiledRunMethod.shared_size_bytes, registers=self._compiledRunMethod.num_regs)
        print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy

    # NOTE(review): `start` is only consumed by the commented-out timing
    # print below.
    if self._timing:
        start = time.time()

    # number of device calls: launches are chunked so no single call
    # exceeds _MAXBLOCKSPERDEVICE blocks
    runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE)))

    for i in range(runs):
        # for last device call calculate number of remaining threads to run
        if i == runs - 1:
            runblocks = int(blocks % self._MAXBLOCKSPERDEVICE)
            if runblocks == 0:
                runblocks = self._MAXBLOCKSPERDEVICE
        else:
            runblocks = int(self._MAXBLOCKSPERDEVICE)

        if self._info:
            print "cuda-sim: Run", runblocks, "blocks."

        # Thread-index window of this chunk; parameters are sliced per set
        # (beta repeats share one set, hence the / self._beta, which is
        # Python 2 integer division), init values per thread.
        min_index = self._MAXBLOCKSPERDEVICE * i * threads
        max_index = min_index + threads * runblocks
        run_parameters = self._parameters[min_index / self._beta:max_index / self._beta]
        run_init_values = self._initValues[min_index:max_index]

        # first run store return Value
        if i == 0:
            self._returnValue = self._run_simulation(run_parameters, run_init_values, runblocks, threads)
        else:
            self._returnValue = np.append(self._returnValue, self._run_simulation(run_parameters, run_init_values, runblocks, threads), axis=0)

    # hand the result back to the consumer
    self.output_cpu.put([self._card, self._returnValue])
    self.output_cpu.close()

    # if self._timing:
    #     print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round((time.time()-start),4), "s"

    if self._info:
        print ""

    # return the context
    self._context.pop()
    del self._context

    return self._returnValue
def run(self, parameters, initValues, timing=True, info=False): #check parameters and initValues for compability with pre-defined parameterNumber and spieciesNumber if (len(parameters[0]) != self._parameterNumber): print "Error: Number of parameters specified (" + str( self. _parameterNumber) + ") and given in parameter array (" + str( len(parameters[0])) + ") differ from each other!" exit() elif (len(initValues[0]) != self._speciesNumber): print "Error: Number of species specified (" + str( self._speciesNumber) + ") and given in species array (" + str( len(initValues[0])) + ") differ from each other!" exit() elif (len(parameters) != len(initValues)): print "Error: Number of sets of parameters (" + str( len(parameters)) + ") and species (" + str( len(initValues)) + ") do not match!" exit() if (self._compiledRunMethod == None and self._runtimeCompile): #compile to determine blocks and threads self._completeCode, self._compiledRunMethod = self._compileAtRuntime( self._stepCode, parameters) blocks, threads = self._getOptimalGPUParam(parameters) if info == True: print "cuda-sim: threads/blocks:", threads, blocks # real runtime compile #self._seedValue = seed #np.random.seed(self._seedValue) # make multiples of initValues initNew = np.zeros((len(initValues) * self._beta, self._speciesNumber)) for i in range(len(initValues)): for j in range(self._beta): for k in range(self._speciesNumber): initNew[i * self._beta + j][k] = initValues[i][k] initValues = initNew if info == True: print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs occ = tools.OccupancyRecord( tools.DeviceData(), threads=threads, shared_mem=self._compiledRunMethod.shared_size_bytes, registers=self._compiledRunMethod.num_regs) print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy if timing: start = time.time() # number of device calls 
runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE))) for i in range(runs): # for last device call calculate number of remaining threads to run if (i == runs - 1): runblocks = int(blocks % self._MAXBLOCKSPERDEVICE) if (runblocks == 0): runblocks = self._MAXBLOCKSPERDEVICE else: runblocks = int(self._MAXBLOCKSPERDEVICE) if info == True: print "cuda-sim: Run", runblocks, "blocks." minIndex = self._MAXBLOCKSPERDEVICE * i * threads maxIndex = minIndex + threads * runblocks runParameters = parameters[minIndex / self._beta:maxIndex / self._beta] runInitValues = initValues[minIndex:maxIndex] #first run store return Value if (i == 0): returnValue = self._runSimulation(runParameters, runInitValues, runblocks, threads) else: returnValue = np.append(returnValue, self._runSimulation( runParameters, runInitValues, runblocks, threads), axis=0) if timing: print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round( (time.time() - start), 4), "s" if info: print "" return returnValue