Example #1
def test2():
    """
    This function tests the performance of launching the CUDA kernel with
    differing values of blocks per grid and threads per block.

    Careful: it is very unstable and will most likely crash your display driver
    unless you set safe ranges. On Windows 7 or newer the driver will simply be
    restarted, but you will need to restart the Python session to reinitialize
    the NumbaPro device hook if you are running this interactively.
    
    TODO: Generate a better visualization for this -- 3d plots are bad...
    """
    n = 50e6 # See test1() for why this is an unbiased genome size
    trials = 20
    w = 16 # window size
    pssm = np.random.rand(4 * w) # generate PSSM
    bpg_range = (16, cuda.get_current_device().MAX_GRID_DIM_X*0.5)
    tpb_range = (32, cuda.get_current_device().MAX_BLOCK_DIM_X*0.5)
    
    data = []
    for t in range(trials):
        for bpg_exp in range(int(math.ceil(math.log(bpg_range[0], 2))), int(math.ceil(math.log(bpg_range[1], 2)) + 1)):
            bpg = 2 ** bpg_exp
            for tpb_exp in range(int(math.ceil(math.log(tpb_range[0], 2))), int(math.ceil(math.log(tpb_range[1], 2)) + 1)):
                tpb = 2 ** tpb_exp
                print t, bpg, tpb
                seq = np.random.randint(0, 3, int(n))
                __, __, run_info = score_sequence(seq, pssm, False, True, True, bpg, tpb)
                data.append((bpg, tpb, run_info["genome_size"] / run_info["runtime"]))
    x = [pt[0] for pt in data]
    y = [pt[1] for pt in data]
    z = [pt[2] for pt in data]
    
    
    xlabel('Blocks per Grid'), ylabel('Threads per Block')
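The plotting call above is truncated (only the axis labels survive). A minimal sketch of one way to visualize the collected (bpg, tpb, throughput) triples, assuming matplotlib is available and reusing the x, y, z lists built above (plot_launch_sweep is a hypothetical helper, not part of the original code):

import matplotlib.pyplot as plt
import numpy as np

def plot_launch_sweep(x, y, z):
    # Scatter the launch-configuration sweep on log2 axes and color each
    # point by throughput (bases scored per second).
    sc = plt.scatter(np.log2(x), np.log2(y), c=z)
    plt.colorbar(sc, label='bp/sec')
    plt.xlabel('log2(Blocks per Grid)')
    plt.ylabel('log2(Threads per Block)')
    plt.show()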
Example #2
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    step_cfg = step[gridsz, blksz, stream]
    
    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last, stream=stream)
        d_last = d_paths

    stream.synchronize()
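The step kernel configured above is not shown in this example. A plausible minimal definition, assuming the usual geometric Brownian motion update (the body is a sketch, not the original kernel; written against numba.cuda, which NumbaPro's cuda module mirrors):

import math
from numba import cuda

@cuda.jit
def step(last, paths, dt, c0, c1, normdist):
    # one thread per path: advance the price by a single time step
    i = cuda.grid(1)
    if i >= paths.shape[0]:
        return
    paths[i] = last[i] * math.exp(c0 * dt + c1 * normdist[i])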
Example #3
def gpumulti(X,mu):
    device = cuda.get_current_device()
    
    n=len(X)
    X=np.array(X)
    x1 = np.array(X.T[0])
    x2 = np.array(X.T[1])
    
    bmk = np.arange(len(x1))
    
    mu = np.array(mu)
    
    dx1 = cuda.to_device(x1)
    dx2 = cuda.to_device(x2)
    dmu = cuda.to_device(mu)
    dbmk = cuda.to_device(bmk)
    
    # Set up enough threads for kernel
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(n)/tpb))
        
    cu_worker[bpg,tpb](dx1,dx2,dmu,dbmk)
    
    bestmukey = dbmk.copy_to_host()
    
    return bestmukey
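The cu_worker kernel is not shown. A hypothetical definition consistent with the call above: each thread assigns one 2-D point to its nearest centroid in mu (the k-means assignment step). The K x 2 layout of mu and the kernel body are assumptions:

from numba import cuda

@cuda.jit
def cu_worker(x1, x2, mu, bmk):
    i = cuda.grid(1)
    if i >= x1.shape[0]:
        return
    best = 0
    bestdist = 1.0e308  # effectively +inf
    for k in range(mu.shape[0]):
        dx = x1[i] - mu[k, 0]
        dy = x2[i] - mu[k, 1]
        d = dx * dx + dy * dy
        if d < bestdist:
            bestdist = d
            best = k
    bmk[i] = best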
Example #4
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example #5
def __init__(self, shape, dtype, prealloc):
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for i in range(prealloc):
        gpumem = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(gpumem)
        self.events[gpumem] = cuda.event(timing=False)
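Only the constructor of the MM memory manager is shown; the pricer examples above also call mm.get() and mm.free(). A plausible reconstruction of the full class under the event-based recycling scheme the constructor sets up (the get/free bodies are assumptions, not the original code):

from collections import deque
from numba import cuda

class MM(object):
    """Memory pool that recycles preallocated device arrays across iterations."""
    def __init__(self, shape, dtype, prealloc):
        self.device = cuda.get_current_device()
        self.freelist = deque()
        self.events = {}
        for i in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)

    def get(self, stream=0):
        # pop a free buffer; if its last recorded use has not finished yet,
        # make the requesting stream wait on that event before reusing it
        gpumem = self.freelist.popleft()
        evnt = self.events[gpumem]
        if not evnt.query():
            evnt.wait(stream=stream)
        return gpumem

    def free(self, gpumem, stream=0):
        # mark the end of the buffer's queued work and return it to the pool
        evnt = self.events[gpumem]
        evnt.record(stream=stream)
        self.freelist.append(gpumem)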
Example #6
def __init__(self, shape, dtype, prealloc):
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for i in range(prealloc):
        gpumem = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(gpumem)
        self.events[gpumem] = cuda.event(timing=False)
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # free each partition's previous buffer once its copy has been queued
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([1])
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))

    tpb = 720
    bpg = 334
    cu_fact[bpg, tpb](d_primes, number, dfact)
    c = dfact.copy_to_host()
    return c
Example #9
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([0] * len(primes))
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))

    tpb = 720
    bpg = 334
    start = timer()
    cu_fact[bpg, tpb](d_primes, number, dfact)
    total = timer() - start
    print "Time taken : ", total
    c = dfact.copy_to_host()
    k = []
    for d in c:
        if int(d) != 0:
            k.append(int(d))
    return k
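The cu_fact kernel is not shown in either factoring snippet. A hypothetical definition consistent with the second variant, where the output array has one slot per prime; the bounds check matters because the fixed 334 x 720 launch usually creates more threads than there are primes:

from numba import cuda

@cuda.jit
def cu_fact(primes, number, out):
    i = cuda.grid(1)
    if i >= primes.shape[0]:
        return
    # record the prime if it divides the target number, otherwise clear the slot
    if number % primes[i] == 0:
        out[i] = primes[i]
    else:
        out[i] = 0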
def device_controller(cid):
    cuda.select_device(cid)                    # bind device to thread
    device = cuda.get_current_device()         # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print( prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY )
    
    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:                        # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print( prefix, 'grid x thread = %d x %d' % (ngrid, nthread) )

    # real CUDA work
    d_data = cuda.to_device(data)                   # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)     # compute inplace
    d_data.copy_to_host(data)                       # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError
def device_controller(cid):
    cuda.select_device(cid)  # bind device to thread
    device = cuda.get_current_device()  # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:  # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)  # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)  # compute inplace
    d_data.copy_to_host(data)  # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError
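Both device_controller variants rely on module-level globals that are not shown: compiler_lock, signature, kernel (and ceil). A minimal sketch consistent with the final data == orig + 1 check; the exact originals may differ:

import threading
from math import ceil                  # used by device_controller above
from numba import cuda, void, int32

compiler_lock = threading.Lock()       # serialize cuda.jit compilation across threads
signature = void(int32[:], int32[:])   # dst, src

def kernel(dst, src):
    # add 1 to every element; compiled per thread via cuda.jit(signature)(kernel)
    i = cuda.grid(1)
    if i < dst.shape[0]:
        dst[i] = src[i] + 1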
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
def mc_cuda(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    step_cfg = step_cuda[gridsz, blksz, stream]
    
    # transfer the initial prices
    d_last = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)
        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_paths = cuda.device_array_like(paths[:, j], stream=stream)
        # invoke step kernel asynchronously
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        # transfer memory back to the host
        d_paths.copy_to_host(paths[:, j], stream=stream)
        d_last = d_paths
    # wait for all GPU work to complete
    stream.synchronize()
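None of the pricer variants show how they are driven. A hypothetical driver (all constants are illustrative, not taken from the original examples); the Fortran ordering keeps each time-step column paths[:, j] contiguous for the device transfers:

import numpy as np

if __name__ == '__main__':
    StockPrice = 20.83
    Volatility = 0.021
    InterestRate = 0.20
    Maturity = 5.0 / 12.0
    NumPath = 1000000
    NumStep = 100

    dt = Maturity / NumStep
    paths = np.zeros((NumPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice

    mc_cuda(paths, dt, InterestRate, Volatility)
    print('mean terminal price: %.4f' % paths[:, -1].mean())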
Example #15
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [
        int(math.ceil(float(partlen) / blksz)) for partlen in partlens
    ]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [
        curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist
    ]

    # Allocate device side array
    d_normlist = [
        cuda.device_array(partlen, dtype=np.double, stream=strm)
        for partlen, strm in zip(partlens, strmlist)
    ]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [
        cu_step[gridsz, blksz, strm]
        for gridsz, strm in zip(gridszlist, strmlist)
    ]

    d_lastlist = [
        cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
        for (s, e), strm in zip(partitions, strmlist)
    ]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [
            cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm))
            for (s, e), strm in zip(partitions, strmlist)
        ]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist,
                                            d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # free each partition's previous buffer once its copy has been queued
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #16
def main():
    device = cuda.get_current_device()
    maxtpb = device.MAX_THREADS_PER_BLOCK
    warpsize = device.WARP_SIZE

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8 where the GPU is used for graphic as well,
    # increasing the following to 10 * 2 ** 20 seems to be necessary to
    # produce consistent result.
    approx_data_size = 1.5 * 2**20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print 'kernel config [%d, %d]' % (gridsz, blksz)

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz // 2, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz // 4, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz // 8, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()
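The benchmark above relies on helpers that are not shown: ceil_to_nearest, time_this, vec_add and the vec_add_ilp_x* kernels. A hypothetical sketch of ceil_to_nearest, the baseline kernel, and the 2-way ILP variant that the gridsz // 2 launch implies (time_this is assumed to return the elapsed seconds for one launch):

import math
from numba import cuda

def ceil_to_nearest(x, m):
    # round x up to the nearest multiple of m
    return int(math.ceil(x / float(m)) * m)

@cuda.jit
def vec_add(a, b, c):
    i = cuda.grid(1)
    if i < c.shape[0]:
        c[i] = a[i] + b[i]

@cuda.jit
def vec_add_ilp_x2(a, b, c):
    # half as many threads are launched, so each thread issues two independent
    # adds one grid-stride apart; the independent loads can overlap (ILP)
    i = cuda.grid(1)
    stride = cuda.gridDim.x * cuda.blockDim.x
    j = i + stride
    if i < c.shape[0]:
        c[i] = a[i] + b[i]
    if j < c.shape[0]:
        c[j] = a[j] + b[j]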

@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a, b, c):
    # i = cuda.grid(1)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw

    if i >= c.size:
        return
    c[i] = a[i] + b[i]


if __name__ == '__main__':
    gpu = cuda.get_current_device()

    n = 100
    a = np.arange(n, dtype=np.float32)
    b = np.arange(n, dtype=np.float32)
    c = np.empty_like(a)

    nthreads = gpu.WARP_SIZE
    nblocks = int(np.ceil(float(n) / nthreads))
    print 'Blocks per grid:', nblocks
    print 'Threads per block', nthreads

    cu_add[nblocks, nthreads](a, b, c)
    print c
Example #18
def score_sequence(seq, pssm, verbose = False, keep_strands = True, benchmark = False, blocks_per_grid = -1, threads_per_block = -1):
    """
    This function will score a sequence of nucleotides based on a PSSM by using
    a sliding window parallelized on a GPU.
    
    Args:
        seq: This must be an integer representation of the nucleotide sequence,
            where the alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a 
            vector (1D array) of integers that can be cast to int32 (See: 
            numpy.int32).
        pssm: This must a vectorized PSSM where every four elements correspond 
            to one position. Make sure this can be cast to an array of float64.
        verbose: Set this to True to print performance information.
        benchmark: If set to True, the function will return information about
            the run in a dictionary at the third output variable.
        keep_strands: Whether memory should be allocated for storing which
            strand the scores come from. Set this to False if you just want the
            scores and the strands array will not be returned.
            NOTE: If this and benchmark are set to False, then the scores will
            not be returned in a tuple, meaning:
                >>> scores = score_sequence(seq, pssm, keep_strands=False)
        blocks_per_grid: This is the blocks per grid that will be assigned to 
            the CUDA kernel. See this SO question for info on choosing this
            value: http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
            It defaults to the length of the sequence or the maximum number of
            blocks per grid supported by the GPU, whichever is lower.
            Set this to a negative number to use this default.
        threads_per_block: Threads per block. See above. It defaults to 55% of
            the maximum number of threads per block supported by the GPU, a
            value determined experimentally. Higher values will likely result
            in failure to allocate resources to the kernel (since there will
            not be enough register space for each thread).
        
    Returns:
        scores: 1D float64 array of length (n - w + 1), where n is the length
            of the sequence and w is the window size. The value at index i of
            this array corresponds to the score of the n-mer at position i in 
            the sequence.
        strands: 1D int32 array of length (n - w + 1). The value at position i
            is either 0 or 1 corresponding to the strand of the score at that
            position where 0 means the forward strand and 1 means reverse.
        run_info: This is a dictionary that is returned if the benchmark
            parameter is set to True. It contains the following:
            >>> run_info.keys()
            ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid']
            Note that the memory_used is rather misleading if running the
            function more than once. CUDA is optimized to not transfer the same
            data from the host to the device so it will not always change. It
            may also unload other assets from memory, so the memory changed can
            be negative.
            TODO: Find a better method of calculating memory usage.
            
    Example:
        >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16
        >>> seq = np.random.randint(0, 3, 30e6) # Generate random 30 million bp sequence
        >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True)
        Threads per block = 563
        Blocks per grid = 53286
        Total threads = 30000018
        Scoring... Done.
        Genome size: 3e+07 bp
        Time: 605.78 ms
        Speed: 4.95229e+07 bp/sec
        >>> scores
        array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401,
               -32.64575614, -23.97110103])
        >>> strands
        array([1, 1, 1, ..., 1, 1, 0])
        >>> run_info
        {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286}
        
    A more interesting interpretation of the run information for performance
    analysis is the number of bases scored per second:
        >>> print "%g bases/sec" % (run_info["genome_size"] / run_info["runtime"])
        1.06127e+08 bases/sec
    """
    w = int(pssm.size / 4) # width of PSSM
    n = int(seq.size) # length of the sequence being scored
    
    # Calculate the reverse-complement PSSM: reversing the flattened
    # (position-major, A/C/G/T-ordered) array reverses the positions and
    # complements each base in one step
    pssm_r = np.array(pssm[::-1])

    # Calculate the appropriate threads per block and blocks per grid    
    if threads_per_block <= 0 or blocks_per_grid <= 0:
        # We don't use the max number of threads to avoid running out of
        # register space by saturating the streaming multiprocessors
        # ~55% was found empirically, but your mileage may vary with different GPUs
        threads_per_block = int(cuda.get_current_device().MAX_BLOCK_DIM_X * 0.55)
        
        # We saturate our grid and let the dynamic scheduler assign the blocks
        # to the discrete CUDA cores/streaming multiprocessors
        blocks_per_grid = int(math.ceil(float(n) / threads_per_block))
        if blocks_per_grid > cuda.get_current_device().MAX_GRID_DIM_X:
            blocks_per_grid = cuda.get_current_device().MAX_GRID_DIM_X
    
    if verbose:
        print "Threads per block = %d" % threads_per_block
        print "Blocks per grid = %d" % blocks_per_grid
        print "Total threads = %d" % (threads_per_block * blocks_per_grid)
    
    # Collect benchmarking info
    s = default_timer()
    start_mem = cuda.current_context().get_memory_info()[0]
    
    # Start a stream
    stream = cuda.stream()
    
    # Copy data to device
    d_pssm = cuda.to_device(pssm.astype(np.float64), stream)
    d_pssm_r = cuda.to_device(pssm_r.astype(np.float64), stream)
    d_seq = cuda.to_device(seq.astype(np.int32), stream)
    
    # Allocate memory on device to store results
    d_scores = cuda.device_array(n - w + 1, dtype=np.float64, stream=stream)
    if keep_strands:
        d_strands = cuda.device_array(n - w + 1, dtype=np.int32, stream=stream)
        
    # Run the kernel
    if keep_strands:
        cuda_score[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores, d_strands)
    else:
        cuda_score_without_strands[blocks_per_grid, threads_per_block](d_pssm, d_pssm_r, d_seq, d_scores)
    
    # Copy results back to host
    scores = d_scores.copy_to_host(stream=stream)
    if keep_strands:
        strands = d_strands.copy_to_host(stream=stream)
    stream.synchronize()
    
    # Collect benchmarking info
    end_mem = cuda.current_context().get_memory_info()[0]
    t = default_timer() - s
    
    # Output info on the run if verbose parameter is true
    if verbose:
        print "Genome size: %g bp" % n
        print "Time: %.2f ms (using time.%s())" % (t * 1000, default_timer.__name__)
        print "Speed: %g bp/sec" % (n / t)
        print "Global memory: %d bytes used (%.2f%% of total)" % \
            (start_mem - end_mem, float(start_mem - end_mem) * 100 / cuda.get_current_device().get_context().get_memory_info()[1])
    
    # Return the run information for benchmarking
    run_info = {"genome_size": n, "runtime": t, "memory_used": start_mem - end_mem, \
                "blocks_per_grid": blocks_per_grid, "threads_per_block": threads_per_block}
                
    # I'm so sorry BDFL, please don't hunt me down for returning different size
    # tuples in my function
    if keep_strands:
        if benchmark:
            return (scores, strands, run_info)
        else:
            return (scores, strands)
    else:
        if benchmark:
            return (scores, run_info)
        else:
            # Careful! This won't return a tuple, so you don't need to do
            # score_sequence[0] to get the scores
            return scores
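Neither cuda_score nor cuda_score_without_strands is shown. A plausible sketch of cuda_score consistent with the launch above: the grid-stride loop covers the case where blocks_per_grid was capped at MAX_GRID_DIM_X, and the layout follows the docstring (four PSSM entries per position, bases encoded 0-3):

from numba import cuda

@cuda.jit
def cuda_score(pssm, pssm_r, seq, scores, strands):
    w = pssm.shape[0] // 4
    stride = cuda.gridDim.x * cuda.blockDim.x
    for i in range(cuda.grid(1), scores.shape[0], stride):
        fwd = 0.0
        rev = 0.0
        for j in range(w):
            base = seq[i + j]
            fwd += pssm[j * 4 + base]
            rev += pssm_r[j * 4 + base]
        # keep the better-scoring strand: 0 = forward, 1 = reverse
        if fwd >= rev:
            scores[i] = fwd
            strands[i] = 0
        else:
            scores[i] = rev
            strands[i] = 1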
Example #19
def main():
    device = cuda.get_current_device()
    maxtpb = device.MAX_THREADS_PER_BLOCK
    warpsize = device.WARP_SIZE

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8 where the GPU is used for graphic as well,
    # increasing the following to 10 * 2 ** 20 seems to be necessary to
    # produce consistent result.
    approx_data_size = 1.5 * 2**20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print 'kernel config [%d, %d]' % (gridsz, blksz)

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print 'data size %dMB' % (N / 2.**20 * A.dtype.itemsize)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)
        

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC))
        assert np.allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()
Example #20
##
## from: https://people.duke.edu/~ccc14/sta-663/CUDAPython.html#more-examples
##

from numbapro import cuda, vectorize, guvectorize, check_cuda
from numbapro import void, uint8 , uint32, uint64, int32, int64, float32, float64, f8
import numpy as np
from timeit import default_timer as timer
import numbapro.cudalib.cublas as cublas



check_cuda()
device = cuda.get_current_device()




### naive matrix multiplication
"""
x1 = np.random.random((4,4))
x2 = np.random.random((4,4))
np.dot(x1, x2).shape
"""



### Kernel function (no shared memory)
@cuda.jit('void(float32[:,:], float32[:,:], float32[:,:], int32)')
def cu_matmul(a, b, c, n):
    x, y = cuda.grid(2)