def runComparison(arguments):
    """
    Run the same render on the CPU and on CUDA for one set of parameters.

    A one-line summary of the parameters (name, position, zoom, dimensions)
    is printed first; the name is only shown when ``arguments.save`` is set.
    The CPU run is done through call_utils.callCPU and the CUDA run through
    call_utils.callCUDA.  Any exception raised by the CUDA run is printed
    and the CUDA time is replaced by the string 'NA'.

    Nothing is returned; the collected timings are not reported here.
    """
    shown_name = str(arguments.name) if arguments.save else ""
    summary_head = "".join([
        "Comparing parameters for (", shown_name, "): ( (",
        str(arguments.pos[0]), ",", str(arguments.pos[1]), "), ",
    ])
    summary_tail = "".join([
        str(arguments.zoom), ", (",
        str(arguments.dim[0]), ", ", str(arguments.dim[1]), ") )",
    ])
    # The two halves are joined by a single space, which is what two
    # chained Python 2 print statements (the first ending in a comma) emit.
    print(summary_head + " " + summary_tail)

    cpuTime = call_utils.callCPU(
        arguments.pos, arguments.zoom, arguments.dim, arguments.name,
        iterations=arguments.iter, save=arguments.save)

    try:
        results, cudaTime, retblock, retthread = call_utils.callCUDA(
            arguments.pos, arguments.zoom, arguments.dim, arguments.name,
            iterations=arguments.iter,
            block=arguments.blocks, thread=arguments.threads,
            save=arguments.save, mode=arguments.mode)
    except Exception as e:
        # Best effort: a failed CUDA run is reported, not propagated.
        print(e)
        cudaTime = 'NA'
def runGeneration(arguments):
    """
    Generate one image with either CUDA or the CPU and print the run time.

    ``arguments.procCuda`` is checked first and selects call_utils.callCUDA;
    otherwise ``arguments.procCpu`` selects call_utils.callCPU.  The time
    returned by the selected call is printed with an "s" suffix.

    Raises:
        ValueError: if neither ``procCuda`` nor ``procCpu`` is set.  This is
            raised before any work is done, instead of failing later on an
            unbound local when the run time is printed.
    """
    if arguments.procCuda:
        backend = "CUDA"
    elif arguments.procCpu:
        backend = "CPU"
    else:
        raise ValueError(
            "runGeneration needs either procCuda or procCpu to be set")

    if backend == "CUDA":
        print("Doing generation ("+arguments.name+") using CUDA")
        # Only the elapsed time of the CUDA run is used here.
        result, elapsed, blocks, threads = call_utils.callCUDA(
            arguments.pos, arguments.zoom, arguments.dim, arguments.name,
            iterations=arguments.iter,
            block=arguments.blocks, thread=arguments.threads,
            save=arguments.save)
    else:
        print("Doing generation ("+arguments.name+") using the CPU")
        elapsed = call_utils.callCPU(
            arguments.pos, arguments.zoom, arguments.dim, arguments.name,
            iterations=arguments.iter, save=arguments.save)

    print("(" + backend + ") run took " + str(elapsed) + "s.")
def runQueueLoopComparison(args):
    """
    Investigate the relative advantages of queueing blocks versus running
    bigger for-loops in order to get the same amount of work done.

    The threads-per-block count is fixed (``args.threads``).  allocate_cores
    supplies the (pixels-per-thread, threads, blocks) combinations for the
    image size in ``args.dim``; each one is run through call_utils.callCUDA
    and one report line is printed per run.  Combinations for which callCUDA
    raises ValueError are printed and skipped.

    If allocate_cores returns no combinations, a message is printed and the
    process exits with status 0.
    """
    width, height = args.dim
    layouts = allocate_cores(width, height, args.threads, silent=True)
    # Debug overrides kept from the original:
    #resultL = ( (12, 1024, 2496 ), (6, 1024, 4992) )
    #resultL = ( (195, 1024, 2496 ), (97, 1024, 4992) )
    if len(layouts) == 0:
        print('No valid integer solutions to dimx*dimy = blocks*threads*ppt')
        exit(0)

    # Mode 4 reports on the contents of the result array instead of timings.
    timing_run = args.mode != 4
    if timing_run:
        timeL, pptL, blocksL = [], [], []

    for ppt, threads, blocks in layouts:
        # eventually call 3-5 times and average
        try:
            result, elapsed, block_dim, thread_dim = call_utils.callCUDA(
                args.pos, args.zoom, (width, height), args.name,
                iterations=args.iter,
                block=blocks, thread=threads, save=args.save,
                mode=args.mode)
        except ValueError as e:
            print(e)
            continue

        if not timing_run:
            # The product is zero as soon as any element of result is zero;
            # the minimum and maximum are printed alongside it.
            # (alternative kept from the original:)
            # overlap = np.sum(result)-(result.shape[0]*result.shape[1])
            overlap = np.prod(result)
            print('%6d,%6d: %6d (%14s) %4d (%12s) %5d %d (%d %d)' % (
                width, height, blocks, block_dim, threads, thread_dim,
                ppt, overlap, np.amin(result), np.amax(result)))
            continue

        timeL.append(elapsed)
        blocksL.append(blocks)
        pptL.append(ppt)
        print('%6d,%6d: %6d (%14s) %4d (%12s) %5d %f' % (
            width, height, blocks, block_dim, threads, thread_dim,
            ppt, elapsed))
def cudaCollect(position,zoom,dimensions,execData,mode=0,iterations=100):
    """
    Run callCUDA over a range of block and thread counts and record the
    time spent by each run in the timing data file.

    Parameters:
        position:   indexable; elements 0 and 1 are stored as pos_x / pos_y.
        zoom:       zoom value, stored in the metadata and passed to callCUDA.
        dimensions: indexable; elements 0 and 1 are stored as
                    dimensions_x / dimensions_y.
        execData:   mapping with 'blocks' and 'threads' entries; every
                    (block, thread) pair is tried.
        mode:       passed to callCUDA; in mode 4 an overlap value is
                    computed from the result array, otherwise 0 is stored.
        iterations: stored in the metadata and passed to callCUDA, so the
                    recorded count is the one actually run.

    Returns:
        The metadata index the timing rows were filed under.

    Pairs for which callCUDA raises ValueError are silently skipped.
    """
    data = _data_file.root.TimingData.data
    meta = _data_file.root.TimingData.meta

    index = alreadyRan(position, dimensions, zoom, mode)
    # TODO: if index, check versions, and WARN if mismatch
    # NOTE(review): the lookup above does not take `iterations`, so an
    # existing entry may have been recorded with a different count than the
    # one used for the rows added below -- confirm whether that matters.
    # NOTE(review): `not index` also treats an index of 0 as "not found";
    # presumably valid indices are never 0 -- verify against alreadyRan.
    if not index:
        # need a new entry
        index = _get_new_meta_index()
        meta.row['index'] = index
        meta.row['pos_x'] = position[0]
        meta.row['pos_y'] = position[1]
        meta.row['dimensions_x'] = dimensions[0]
        meta.row['dimensions_y'] = dimensions[1]
        meta.row['zoom'] = zoom
        meta.row['mode'] = mode
        meta.row['iterations'] = iterations
        verD = get_version_info()
        for sym in ('os', 'nvidia', 'cuda_device', 'cuda_toolkit', 'gcc',
                    'python', 'numpy', 'pycuda', 'pytables', 'code_git'):
            meta.row['versioninfo/%s'%sym] = verD[sym]
        meta.row.append()
        meta.flush()

    for block in execData['blocks']:
        for thread in execData['threads']:
            # TODO: check to see if we have done this combo already for the given metadata
            name = str(block) + ", " + str(thread)
            try:
                result, elapsed, block_dim, thread_dim = callCUDA(
                    position, zoom, dimensions, name,
                    iterations=iterations,
                    block=block, thread=thread, save=False, mode=mode)
            except ValueError:
                continue

            print("GOOD \t" + name + ": " + str(elapsed))

            data.row['metaIndexFK'] = index
            data.row['time'] = elapsed
            data.row['block_x'] = block_dim[0]
            data.row['block_y'] = block_dim[1]
            data.row['blocks'] = block
            data.row['thread_x'] = thread_dim[0]
            data.row['thread_y'] = thread_dim[1]
            data.row['threads'] = thread
            if mode == 4:
                # Sum of the result array minus its element count.
                data.row['overlap'] = np.sum(result)-(result.shape[0]*result.shape[1])
            else:
                data.row['overlap'] = 0
            data.row.append()
            # Flush after each row so finished timings are written out even
            # if a later combination fails.
            data.flush()

    return index