示例#1
0
def runComparison(arguments):
    """
    Run the same render on the CPU and on CUDA so their timings can be compared.

    ``arguments`` is read for ``pos``, ``zoom``, ``dim``, ``name``, ``iter``,
    ``save``, ``blocks``, ``threads`` and ``mode``.

    A failure of the CUDA run is printed and its time is recorded as the
    string 'NA' instead of aborting the comparison.
    """
    # Single-argument print() and "except ... as" behave identically on
    # Python 2.6+ and Python 3.
    label = str(arguments.name) if arguments.save else ""
    # NOTE: the double space after "), " is part of the established output format.
    print("Comparing parameters for (" + label + "): ( ("
          + str(arguments.pos[0]) + "," + str(arguments.pos[1]) + "),  "
          + str(arguments.zoom) + ", ("
          + str(arguments.dim[0]) + ", " + str(arguments.dim[1]) + ") )")

    cpuTime = call_utils.callCPU(arguments.pos, arguments.zoom, arguments.dim, arguments.name,
                                 iterations=arguments.iter, save=arguments.save)
    try:
        results, cudaTime, retblock, retthread = call_utils.callCUDA(
            arguments.pos, arguments.zoom, arguments.dim, arguments.name,
            iterations=arguments.iter, block=arguments.blocks, thread=arguments.threads,
            save=arguments.save, mode=arguments.mode)
    except Exception as e:
        # Deliberately broad: any CUDA-side failure is reported and the
        # CUDA time is marked as unavailable.
        print(e)
        cudaTime = 'NA'
示例#2
0
def runGeneration(arguments):
    """
    Run a single generation on CUDA or on the CPU and print how long it took.

    ``arguments.procCuda`` selects the CUDA path; otherwise
    ``arguments.procCpu`` selects the CPU path. ``arguments`` is also read
    for ``pos``, ``zoom``, ``dim``, ``name``, ``iter`` and ``save`` (plus
    ``blocks`` and ``threads`` on the CUDA path).

    Raises:
        ValueError: if neither ``procCuda`` nor ``procCpu`` is set, since
            there is then no run to time.
    """
    if arguments.procCuda:
        backend = "CUDA"
        print("Doing generation ("+arguments.name+") using CUDA")

        result, elapsed, blocks, threads = call_utils.callCUDA(
            arguments.pos, arguments.zoom, arguments.dim, arguments.name,
            iterations=arguments.iter, block=arguments.blocks,
            thread=arguments.threads, save=arguments.save)

    elif arguments.procCpu:
        backend = "CPU"
        print("Doing generation ("+arguments.name+") using the CPU")
        elapsed = call_utils.callCPU(arguments.pos, arguments.zoom, arguments.dim, arguments.name,
                                     iterations=arguments.iter, save=arguments.save)

    else:
        # Without this guard the timing report below would fail with an
        # UnboundLocalError because no run ever produced a time.
        raise ValueError("runGeneration requires arguments.procCuda or arguments.procCpu to be set")

    print("(" + backend + ") run took " + str(elapsed) + "s.")
示例#3
0
def runQueueLoopComparison(args):
    """
    wherein we investigate the relative advantages of queueing blocks
    vs having bigger for-loops in order to get the same amount of work
    done.

    here we fix a thread-per-block count, solve for the set of integer
    values of blocks and pixels-per-thread that tile a fixed size
    image, and time the runs.

    ``args`` is read for ``dim``, ``threads``, ``pos``, ``zoom``, ``name``,
    ``iter``, ``save`` and ``mode``. One report line is printed per run;
    when ``args.mode`` is 4 the line shows statistics of the returned
    result array instead of the elapsed time. Runs for which callCUDA
    raises ValueError are printed and skipped.

    Exits the process with status 0 if ``allocate_cores`` returns no
    solutions.
    """
    # Local import: sys.exit() is the supported way to end a program; the
    # bare exit() helper is only installed by the ``site`` module.
    import sys

    dimx, dimy = args.dim
    resultL = allocate_cores(dimx, dimy, args.threads, silent=True)
    if len(resultL) == 0:
        print('No valid integer solutions to dimx*dimy = blocks*threads*ppt')
        sys.exit(0)

    timing = args.mode != 4
    if timing:
        # NOTE(review): these lists are filled below but nothing in this
        # function reads them afterwards.
        timeL = []
        pptL = []
        blocksL = []

    for ppt, threads, blocks in resultL:
        # eventually call 3-5 times and average
        try:
            result, elapsed, blocksA, threadsA = call_utils.callCUDA(
                args.pos, args.zoom, (dimx, dimy), args.name,
                iterations=args.iter, block=blocks, thread=threads,
                save=args.save, mode=args.mode)
        except ValueError as e:
            print(e)
            continue

        if timing:
            timeL.append(elapsed)
            blocksL.append(blocks)
            pptL.append(ppt)
            print('%6d,%6d:   %6d (%14s)  %4d (%12s) %5d   %f'
                  % (dimx, dimy, blocks, blocksA, threads, threadsA, ppt, elapsed))
        else:
            # should be zero, on average, really want to see if there are any zeros or numbers > 2
            overlap = np.prod(result)
            print('%6d,%6d:   %6d (%14s)  %4d (%12s) %5d   %d (%d %d)'
                  % (dimx, dimy, blocks, blocksA, threads, threadsA, ppt,
                     overlap, np.amin(result), np.amax(result)))
示例#4
0
def cudaCollect(position,zoom,dimensions,execData,mode=0,iterations=100):
    """
    Run callCUDA over a range of block and thread shapes and sizes, and collect data on time spent.

    If ``alreadyRan`` reports no existing entry for this position,
    dimensions, zoom and mode, a new metadata row (including version
    information) is written first. One data row is then appended and
    flushed for every (block, thread) pair from ``execData['blocks']`` and
    ``execData['threads']``; pairs for which callCUDA raises ValueError are
    skipped.

    ``iterations`` is stored in the metadata row and forwarded to callCUDA
    so the recorded value matches the run that was timed.

    Returns the metadata index that the data rows refer to.
    """
    data = _data_file.root.TimingData.data
    meta = _data_file.root.TimingData.meta

    index = alreadyRan(position, dimensions, zoom, mode)
    # TODO: if index, check versions, and WARN if mismatch
    # NOTE(review): a falsy index (e.g. 0) is treated as "not run yet" --
    # confirm alreadyRan never returns 0 for an existing entry.
    if not index:
        # need a new entry
        index = _get_new_meta_index()
        meta.row['index'] = index
        meta.row['pos_x'] = position[0]
        meta.row['pos_y'] = position[1]
        meta.row['dimensions_x'] = dimensions[0]
        meta.row['dimensions_y'] = dimensions[1]
        meta.row['zoom'] = zoom
        meta.row['mode'] = mode
        meta.row['iterations'] = iterations

        verD = get_version_info()
        for sym in ('os', 'nvidia', 'cuda_device', 'cuda_toolkit', 'gcc',
                    'python', 'numpy', 'pycuda',   'pytables', 'code_git'):
            meta.row['versioninfo/%s'%sym] = verD[sym]

        meta.row.append()
        meta.flush()

    for block in execData['blocks']:
        for thread in execData['threads']:
            # TODO: check to see if we have done this combo already for the given metadata
            name = str(block) + ", " + str(thread)
            try:
                result, elapsed, block_dim, thread_dim = callCUDA(
                    position, zoom, dimensions, name,
                    iterations=iterations, block=block, thread=thread,
                    save=False, mode=mode)
            except ValueError:
                # This block/thread combination was rejected; move on.
                continue

            print("GOOD \t" + name + ": " + str(elapsed))

            data.row['metaIndexFK'] = index
            data.row['time'] = elapsed
            data.row['block_x'] = block_dim[0]
            data.row['block_y'] = block_dim[1]
            data.row['blocks'] = block
            data.row['thread_x'] = thread_dim[0]
            data.row['thread_y'] = thread_dim[1]
            data.row['threads'] = thread
            if mode == 4:
                # Sum of the result minus its element count.
                data.row['overlap'] = np.sum(result) - (result.shape[0] * result.shape[1])
            else:
                data.row['overlap'] = 0

            data.row.append()
            data.flush()

    return index