Example #1
    def test_spawn_concurrent_compilation(self):
        # force CUDA context init
        cuda.get_current_device()
        # use "spawn" to avoid inheriting the CUDA context
        ctx = multiprocessing.get_context('spawn')

        q = ctx.Queue()
        p = ctx.Process(target=spawn_process_entry, args=(q,))
        p.start()
        try:
            err = q.get()
        finally:
            p.join()
        if err is not None:
            raise AssertionError(err)
        self.assertEqual(p.exitcode, 0, 'test failed in child process')
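The child entry point `spawn_process_entry` is not part of this snippet. A minimal sketch of what it might look like, assuming only the queue protocol visible above (put None on success, an error string on failure):

def spawn_process_entry(q):
    # Hypothetical child-process body: compile and run a trivial kernel in a
    # fresh CUDA context, then report the outcome through the queue.
    try:
        import numpy as np
        from numba import cuda

        @cuda.jit
        def add_one(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] += 1

        arr = np.zeros(16, dtype=np.int32)
        add_one[1, 16](arr)
        assert (arr == 1).all()
        q.put(None)
    except Exception as e:
        q.put(str(e))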
Example #2
    def setup(self):
        for a, arg in enumerate(sys.argv):
            if arg.find('-c') > -1:
                self._config_path = sys.argv[a + 1]

            if arg.find('-hc') > -1:
                from numba import cuda
                gpu = cuda.get_current_device()
                print("name = %s" % gpu.name)
                print("maxThreadsPerBlock = %s" %
                      str(gpu.MAX_THREADS_PER_BLOCK))
                print("maxBlockDimX = %s" % str(gpu.MAX_BLOCK_DIM_X))
                print("maxBlockDimY = %s" % str(gpu.MAX_BLOCK_DIM_Y))
                print("maxBlockDimZ = %s" % str(gpu.MAX_BLOCK_DIM_Z))
                print("maxGridDimX = %s" % str(gpu.MAX_GRID_DIM_X))
                print("maxGridDimY = %s" % str(gpu.MAX_GRID_DIM_Y))
                print("maxGridDimZ = %s" % str(gpu.MAX_GRID_DIM_Z))
                print("maxSharedMemoryPerBlock = %s" %
                      str(gpu.MAX_SHARED_MEMORY_PER_BLOCK))
                print("asyncEngineCount = %s" % str(gpu.ASYNC_ENGINE_COUNT))
                print("canMapHostMemory = %s" % str(gpu.CAN_MAP_HOST_MEMORY))
                print("multiProcessorCount = %s" %
                      str(gpu.MULTIPROCESSOR_COUNT))
                print("warpSize = %s" % str(gpu.WARP_SIZE))
                print("unifiedAddressing = %s" % str(gpu.UNIFIED_ADDRESSING))
                print("pciBusID = %s" % str(gpu.PCI_BUS_ID))
                print("pciDeviceID = %s" % str(gpu.PCI_DEVICE_ID))

                exit()

            if arg.find('-h') > -1:
                print('start the .py with -c and a path to a config file')
                exit()

        self.load_config()
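Since every one of these fields is a plain attribute on the object returned by cuda.get_current_device(), the same report can be produced generically; a compact sketch (the helper name below is made up for illustration):

def print_device_attrs(names=('MAX_THREADS_PER_BLOCK', 'MAX_GRID_DIM_X',
                              'MULTIPROCESSOR_COUNT', 'WARP_SIZE')):
    from numba import cuda
    gpu = cuda.get_current_device()
    print("name = %s" % gpu.name)
    for attr in names:
        # each uppercase name maps to the corresponding CUDA device attribute
        print("%s = %s" % (attr, getattr(gpu, attr)))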
Example #3
def gpu_release():
    '''
    Release gpu memory
    '''

    device = cuda.get_current_device()
    device.reset()
Example #4
def init_server():
    _global_addr[0] = socket.gethostname()
    logger.info("host addr: %s", _global_addr[0])
    devnum = cuda.get_current_device().id
    th = threading.Thread(target=server_loop, args=[devnum])
    th.daemon = True
    th.start()
Example #5
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([0] * len(primes))
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))

    limit = len(primes)

    getcontext().prec = 1000

    l = Decimal(limit) / Decimal(24)

    if l <= 1024:
        tpb = l
        bpg = 1
    else:
        tpb = 1024
        bpg = Decimal(l) / Decimal(tpb)

    itpb = int(math.ceil(tpb))
    ibpg = int(math.ceil(bpg))

    cu_fact[ibpg, itpb](d_primes, number, dfact)

    c = dfact.copy_to_host()
    k = []
    for d in c:
        if int(d) != 0:
            k.append(int(d))
    return k
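The cu_fact kernel itself is not shown. Given how the host code collects the non-zero entries of dfact, a plausible sketch (an assumption, not the original kernel) is:

from numba import cuda

@cuda.jit
def cu_fact(d_primes, number, dfact):
    # Hypothetical kernel: each thread tests one candidate prime and records it
    # in dfact if it divides `number`.
    i = cuda.grid(1)
    if i < d_primes.shape[0] and number % d_primes[i] == 0:
        dfact[i] = d_primes[i]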
Example #6
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example #7
def cuda_args(shape):
    """
    Compute the blocks-per-grid and threads-per-block parameters for use when
    invoking cuda kernels

    Parameters
    ----------
    shape: int or tuple of ints
        The shape of the input array that the kernel will parallelize over

    Returns
    -------
    tuple
        Tuple of (blocks_per_grid, threads_per_block)
    """
    if isinstance(shape, int):
        shape = (shape, )

    max_threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    # Note: We divide max_threads by 2.0 to leave room for the registers
    # occupied by the kernel. For some discussion, see
    # https://github.com/numba/numba/issues/3798.
    threads_per_block = int(ceil(max_threads / 2.0)**(1.0 / len(shape)))
    tpb = (threads_per_block, ) * len(shape)
    bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)
    return bpg, tpb
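A possible usage sketch (the scale kernel below is hypothetical, just to show how the returned configuration is consumed):

import numpy as np
from numba import cuda

@cuda.jit
def scale(arr, factor):
    i, j = cuda.grid(2)
    if i < arr.shape[0] and j < arr.shape[1]:
        arr[i, j] *= factor

a = np.ones((1024, 1024), dtype=np.float32)
bpg, tpb = cuda_args(a.shape)
scale[bpg, tpb](a, 2.0)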
Example #8
def gpu_name():
    '''
    Query the GPU's properties via Numba to obtain the name of the device.
    '''
    device = cuda.get_current_device()
    name = device.name
    return name
Example #9
def bloch_sim_batch_cuda3(Nexample, batch_size, Nk, PDr, T1r, T2r, dfr, b1r,
                          M0, trr, ter, far, ti):
    sim_out = np.zeros((Nexample, Nk), dtype=np.complex128)  #final output data
    batch_data = np.zeros((batch_size, Nk),
                          dtype=np.complex128)  #batch output data
    bT1r = np.zeros(batch_size, dtype=np.float32)  #batch T1, T2, PD, df arrays
    bT2r = np.zeros(batch_size, dtype=np.float32)
    bPDr = np.zeros(batch_size, dtype=np.float32)
    bdfr = np.zeros(batch_size, dtype=np.float32)
    bb1r = np.zeros(batch_size, dtype=np.float32)
    #set total number of threads on GPU
    device = cuda.get_current_device()
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(batch_size) / tpb))
    # batch loop
    for nb in range(Nexample // batch_size):
        #print('Doing batch %d/%d for applying Bloch sim' % (nb+1,Nexample//batch_size))
        bstart = nb * batch_size  #batch start index
        bstop = bstart + batch_size  #batch stop index
        #print('%d:%d' % (bstart,bstop))
        bT1r = T1r[bstart:bstop]  #batch T1, T2, PD, df arrays
        bT2r = T2r[bstart:bstop]
        bPDr = PDr[bstart:bstop]
        bdfr = dfr[bstart:bstop]
        bb1r = b1r[bstart:bstop]
        #start the parallel computing on GPU
        bloch_sim_irssfp_cuda3[bpg,
                               tpb](batch_size, Nk, bPDr, bT1r, bT2r, bdfr,
                                    bb1r, M0, trr, ter, far, ti, batch_data)
        sim_out[bstart:bstop, :] = batch_data
    return sim_out
Example #10
    def test_spawn_concurrent_compilation(self):
        # force CUDA context init
        cuda.get_current_device()
        # use "spawn" to avoid inheriting the CUDA context
        ctx = multiprocessing.get_context('spawn')

        q = ctx.Queue()
        p = ctx.Process(target=spawn_process_entry, args=(q, ))
        p.start()
        try:
            err = q.get()
        finally:
            p.join()
        if err is not None:
            raise AssertionError(err)
        self.assertEqual(p.exitcode, 0, 'test failed in child process')
Example #11
def initialize_and_compile(mat_size, mat_type):
    shape = np.array([mat_size, mat_size])
    if mat_type.startswith('uint'):
        shape[1] = math.ceil(mat_size / float(mat_type[4:]))
    mat = cuda.to_device(np.zeros(tuple(shape), dtype=mat_type))
    global threadsperblock
    threadsperblock = (int(
        np.sqrt(cuda.get_current_device().MAX_THREADS_PER_BLOCK)), ) * 2
    blockspergrid = tuple(
        int(math.ceil(mat_size / threadsperblock[i])) for i in (0, 1))
    is_changed = cuda.device_array((1, ), dtype=bool)

    global size, matmul_method
    if mat_type == 'bool':
        matmul_method = matmul_bool[blockspergrid, threadsperblock]
    elif mat_type == 'uint8':
        matmul_method = matmul_uint[blockspergrid, threadsperblock]
        size = 8
    elif mat_type == 'uint32':
        matmul_method = matmul_uint[blockspergrid, threadsperblock]
        size = 32
    else:
        raise ValueError(
            'GPU multiplication of matrices type {} is not supported'.format(
                mat_type))

    matmul_method(mat, mat, mat, is_changed)
Example #12
def reset_GPU():
    """
    Reset the GPU and release its memory.
    """
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
Example #13
def hist_vec_by_r_cu(x, dr, r_bin, r_max, middle=None, gpu=0):
    r"""Summing vector based function to modulus based function.
    $f(r) := \int F(\bm{r})\delta(r-|\bm{r}|)\mathrm{d}\bm{r} / \int \delta(r-|\bm{r}|)\mathrm{d}\bm{r}$
    :param x: np.ndarray, input values sampled on a grid
    :param dr: np.ndarray[ndim=2], x[len_1, len_2, ..., len_n] ~ (r1: len_1, r2: len_2, ..., r_n: len_n);
        x: (Nx, Ny, Nz) -> dr: (3 (xyz), N); currently only Nx == Ny == Nz is supported
    :param r_bin: double, bin size of r
    :param r_max: double, max of r
    :param middle: np.ndarray or None, origin used for the radial distance (zeros if None)
    :param gpu: int, GPU index
    :return: np.ndarray, averaged $F(x, y, ...) \to \frac{1}{4\pi r^2} f(\sqrt{x^2+y^2+\ldots})$
    """
    dim = np.asarray(x.shape, dtype=np.int64)
    ret = np.zeros(int(r_max / r_bin) + 1, dtype=np.float64)
    r_max2 = float((ret.shape[0] * r_bin)**2)
    cter = np.zeros(ret.shape, dtype=np.int64)
    x = x.ravel(order='F')
    if middle is None:
        middle = np.zeros(dim.shape[0], dtype=np.float64)
    with cuda.gpus[gpu]:
        device = cuda.get_current_device()
        tpb = device.WARP_SIZE
        bpg = int(x.shape[0] // tpb + 1)
        if np.issubdtype(x.dtype, np.complexfloating):
            x_real = np.ascontiguousarray(x.real)
            x_imag = np.ascontiguousarray(x.imag)
            ret_imag = np.zeros(int(r_max / r_bin) + 1, dtype=np.float64)
            _cu_kernel_complex[bpg, tpb](x_real, x_imag, dim, middle, dr,
                                         r_bin, r_max2, ret, ret_imag, cter)
            ret = ret + ret_imag * 1j
        else:
            _cu_kernel[bpg, tpb](x, dim, middle, dr, r_bin, r_max2, ret, cter)
    cter[cter == 0] = 1
    return ret / cter
Example #14
def main():
    A = np.ones((20, 50000), dtype=np.float32)
    B = np.ones((3072, 50000), dtype=np.float32)
    C = np.ones((20, 3072, 50000), dtype=np.float32)
    (Ni, Nj, Nk) = C.shape

    my_gpu = cuda.get_current_device()
    thread_ct = 8
    block_ct_x = int(math.ceil(float(Ni) / thread_ct))
    block_ct_y = int(math.ceil(float(Nj) / thread_ct))
    block_ct_z = int(math.ceil(float(Nk) / thread_ct))

    blockdim = thread_ct, thread_ct, thread_ct
    griddim = block_ct_x, block_ct_y, block_ct_z
    print("Threads per block:", blockdim)
    print("Blocks per grid:", griddim)

    start = timer()
    Cg = cuda.to_device(C)
    mult_kernel[griddim, blockdim](A, B, Cg)
    Cg.copy_to_host(C)
    dt = timer() - start
    print("Computation done in %f s" % (dt))

    print('C[:3,1,1] = ', C[:3, 1, 1])
    print('C[-3:,1,1] = ', C[-3:, 1, 1])
Example #15
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example #16
def incoherent_scattering(traj, q, dq, q_vectors=None):
    ret = np.zeros((traj.shape[1], traj.shape[0]))
    n = int(np.round(q / dq))
    q_vecs = q_vectors
    if q_vecs is None:

        @nb.jit(nopython=True)
        def _generate_q_vecs(_n, _dq, _q, _shape):
            ret = []
            for _qq in np.ndindex(_shape):
                _q_mag = 0.0
                for _qqq in _qq:
                    _q_mag += (_qqq - _n) ** 2
                _q_mag = _q_mag ** 0.5 * _dq
                # keep only vectors whose modulus is within 0.15% of the target q
                if abs(_q_mag - _q) / _q < 1.5e-3:
                    ret.append(_qq)
            return ret

        shape = (n * 2, ) * 3
        q_vecs = _generate_q_vecs(n, dq, q, shape)
        q_vecs = np.asarray((np.array(q_vecs) - n) * dq, dtype=np.float64)
    print('Start with Q vecs:', q_vecs.shape)
    import time
    s = time.time()
    with cuda.gpus[2]:
        device = cuda.get_current_device()
        tpb = (device.WARP_SIZE, ) * 2
        bpg = (math.ceil(traj.shape[0] / tpb[0]),
               math.ceil(traj.shape[0] / tpb[1]))
        _cu_kernel[bpg, tpb](traj, q_vecs, ret)
    print(time.time() - s)
    return ret.T / np.arange(traj.shape[0], 0,
                             -1)[:, None] / q_vecs.shape[0], q_vecs
Example #17
    def __init__(self, shape, dtype, prealloc):
        self.device = cuda.get_current_device()
        self.freelist = deque()
        self.events = {}
        for i in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)
Example #18
    def __init__(self, shape, dtype, prealloc):
        self.device = cuda.get_current_device()
        self.freelist = deque()
        self.events = {}
        for i in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)
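The pool's get and free methods are not included in this snippet. Based on how the monte_carlo_pricer examples call mm.get(stream=...) and mm.free(buf, stream=...), they presumably look roughly like the following sketch (an assumption, not the project's verbatim code):

    def get(self, stream=0):
        # Hand out a preallocated buffer, making `stream` wait until the
        # buffer's previous user has finished with it.
        assert self.freelist
        gpumem = self.freelist.popleft()
        evnt = self.events[gpumem]
        if not evnt.query():
            evnt.wait(stream=stream)
        return gpumem

    def free(self, gpumem, stream=0):
        # Record when the current stream is done with the buffer and recycle it.
        evnt = self.events[gpumem]
        evnt.record(stream=stream)
        self.freelist.append(gpumem)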
Example #19
def gpu_compute_capability():
    '''
    Query the GPU's properties via Numba to obtain the compute capability of the device.
    '''
    device = cuda.get_current_device()
    compute = device.compute_capability

    return compute
Example #20
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #21
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example #22
def GPUWrapper(data_out, device_id,
               photons_req_per_device, max_photons_per_device,
               muS, g,
               source_type, source_param1, source_param2,
               detector_params,
               max_N, max_distance_from_det,
               target_type, target_mask, target_gridsize,
               z_target, z_bounded, z_range, ret_cols):

    # TODO: These numbers can be optimized based on the device / architecture / number of photons
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(np.ceil(float(photons_req_per_device)/(threads_per_block * blocks)))
    max_photons_per_thread = int(np.ceil(float(max_photons_per_device)/(threads_per_block * blocks)))

    cuda.select_device(device_id)
    device = cuda.get_current_device()
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # Keeping this piece of code here for now -potentially we need this in the future
  #  with compiler_lock:                        # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
        # signature is the call to the function. So far I couldn't figure out what is the signature of the
        # rng_states, closest I got to was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
        # But I couldn't get it to work yet.
   #     MC_cuda_kernel = cuda.jit(propPhotonGPU)


    data = np.ndarray(shape=(threads_per_block*blocks, photons_per_thread, 11), dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block*blocks, 5), dtype=np.int_)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters, stream=stream)

    # Used to initialize the threads random states.
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=(np.random.randint(sys.maxsize)-128)+device_id, stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](rng_states, data_out_device, photon_counters_device,
                                             photons_per_thread, max_photons_per_thread, muS, g,
                                             source_type, source_param1, source_param2,
                                             detector_params,
                                             max_N, max_distance_from_det,
                                             target_type, target_mask, target_gridsize,
                                             z_target, z_bounded, z_range)
    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0]*data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
Example #23
    def compute_fid_score(self):
        for img_type in self.image_types:
            os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
            self.fid_value[img_type] = calculate_fid_given_paths([self.gt_path, self.model_path], None, low_profile=False)
            print("FID: ", self.fid_value[img_type])
            # self.fid_value[img_type] = calculate_fid_given_paths(paths=[self.gt_path, self.model_path], batch_size=self.batch_size, dims=self.fid_dims, num_workers=self.num_workers, mod_type='_' + img_type)

            device = cuda.get_current_device()
            device.reset()
Example #24
def calc_psi(r, t, z, out):
    out[:] = 0
    K = r.shape[0]
    MY, MX = out.shape
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(K * MY * MX / threadsperblock)
    psi_kernel[blockspergrid, threadsperblock](r, t, z, out)
    return out
Example #25
File: __init__.py  Project: mcvine/acc
def get_number_of_cores_current_device():
    device = cuda.get_current_device()
    sms = getattr(device, 'MULTIPROCESSOR_COUNT')
    cc = device.compute_capability
    cores_per_sm = cc_cores_per_SM_dict[cc]
    total_cores = cores_per_sm * sms
    print("GPU compute capability: ", cc)
    print("GPU total number of SMs: ", sms)
    print("total cores: ", total_cores)
    return total_cores
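The cc_cores_per_SM_dict lookup table is assumed by this function but not shown. A sketch with the commonly cited CUDA-cores-per-SM counts per compute capability (worth double-checking against NVIDIA's documentation for your architecture):

cc_cores_per_SM_dict = {
    (2, 0): 32, (2, 1): 48,                      # Fermi
    (3, 0): 192, (3, 5): 192, (3, 7): 192,       # Kepler
    (5, 0): 128, (5, 2): 128,                    # Maxwell
    (6, 0): 64, (6, 1): 128,                     # Pascal
    (7, 0): 64, (7, 5): 64,                      # Volta / Turing
    (8, 0): 64, (8, 6): 128, (8, 9): 128,        # Ampere / Ada
    (9, 0): 128,                                 # Hopper
}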
Example #26
def _cumacula(
    fmod,
    deltaratio,
    t,
    theta_star,
    theta_spot,
    theta_inst,
    tstart,
    tend,
    c,
    d,
    Fab,
    TdeltaV=False,
):
    if fmod.dtype == "float32":
        numba_type = float32
    elif fmod.dtype == "float64":
        numba_type = float64
    else:
        raise ValueError("fmod must be float32 or float64, got %s" % fmod.dtype)

    if (str(numba_type)) in _kernel_cache:
        kernel = _kernel_cache[(str(numba_type))]
    else:
        sig = _numba_cumacula_signature(numba_type)
        if fmod.dtype == "float32":
            kernel = _kernel_cache[(str(numba_type))] = cuda.jit(
                sig, fastmath=True)(_numba_cumacula_32)
            print("Registers(32)", kernel._func.get().attrs.regs)
        elif fmod.dtype == "float64":
            kernel = _kernel_cache[(str(numba_type))] = cuda.jit(
                sig, fastmath=True)(_numba_cumacula_64)
            print("Registers(64)", kernel._func.get().attrs.regs)

    gpu = cuda.get_current_device()
    numSM = gpu.MULTIPROCESSOR_COUNT
    threadsperblock = (128, )
    blockspergrid = (numSM * 20, )

    kernel[blockspergrid, threadsperblock](
        fmod,
        deltaratio,
        t,
        theta_star,
        theta_spot,
        theta_inst,
        tstart,
        tend,
        c,
        d,
        Fab,
        TdeltaV,
    )
    cuda.synchronize()
Example #27
    def test_compile_ptx_for_current_device(self):
        def add(x, y):
            return x + y

        args = (float32, float32)
        ptx, resty = compile_ptx_for_current_device(add, args, device=True)

        # Check we target the current device's compute capability, or the
        # closest compute capability supported by the current toolkit.
        device_cc = cuda.get_current_device().compute_capability
        cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc)
        target = f'.target sm_{cc[0]}{cc[1]}'
        self.assertIn(target, ptx)
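A standalone version of the same check, assuming a Numba release that exposes compile_ptx_for_current_device under numba.cuda (as the test above relies on):

from numba import float32
from numba.cuda import compile_ptx_for_current_device

def add(x, y):
    return x + y

# Compile for the device in the current context and inspect the generated PTX.
ptx, resty = compile_ptx_for_current_device(add, (float32, float32), device=True)
print(ptx[:200])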
Example #28
def cu_cell_list_argsort(pos, box, ibox, gpu=0):
    n = pos.shape[0]
    n_cell = np.multiply.reduce(ibox)
    cell_id = np.zeros(n).astype(np.int64)
    with cuda.gpus[gpu]:
        device = cuda.get_current_device()
        tpb = device.WARP_SIZE
        bpg = ceil(n / tpb)
        cu_cell_ind[bpg, tpb](pos, box, ibox, cell_id)
    cell_list = np.argsort(cell_id)  # pyculib radixsort for cuda acceleration.
    cell_id = cell_id[cell_list]
    cell_counts = np.r_[0, np.cumsum(np.bincount(cell_id, minlength=n_cell))]
    return cell_list.astype(np.int64), cell_counts.astype(np.int64)
Example #29
def Clear():
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
    #cuda.current_context().trashing.clear()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.current_context().deallocations.clear()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.select_device(0)
    #do tf stuff
    cuda.close()
Example #30
def AtF2(z, psi, r, out):
    """

    :param z:   K x MY x MX
    :param psi: B x K x MY x MX
    :param r:   K x 2
    :param out: B x NY x NX
    :return:
    """
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(np.prod(z.shape) / threadsperblock)
    AtF2_kernel[blockspergrid, threadsperblock](z, psi, r, out)
    return out
Example #31
File: nn.py  Project: alcides/bigdatagpunn
def train_cuda(X, y, conf, iterations=6000):
    gpu = cuda.get_current_device()
    syn0, syn1 = conf
    syn0g = cuda.to_device(syn0)
    syn1g = cuda.to_device(syn1)
    Xg = cuda.to_device(X)
    yg = cuda.to_device(y)
    rows = X.shape[0]
    thread_ct = (gpu.WARP_SIZE, gpu.WARP_SIZE)
    block_ct = [int(math.ceil(float(x) / thread_ct[0])) for x in (rows, ndims)]
    train_kernel[block_ct, thread_ct](Xg, yg, syn0g, syn1g, iterations)
    syn0g.copy_to_host(syn0)
    syn1g.copy_to_host(syn1)
    return (syn0, syn1)
Example #32
    def __init__(self, trainPath='proxy'):
        self.__name = trainPath
        self.__model = models.Sequential()
        self.__trainPath = os.path.join(learnDir, 'train', trainPath)
        self.__testPath = os.path.join(learnDir, 'test')
        self.__modelPath = os.path.join(main, 'models')
        self.__modelType = ''

        self.__history = None
        self.__train_gen = ImageDataGenerator()
        self.__val_gen = ImageDataGenerator()
        self.__gpu = cuda.get_current_device()
        self.__batchSize = 64
        self.__seed = 101
        self.__xSize = 224
        self.__ySize = 224
Example #33
def Qoverlap_real2(r, z, out):
    """

    :param r: K x 2
    :param z: BB x K x MY x MX
    :param out: BB x NY x NX
    :return: out
    """
    BB = out.shape[0]
    K = r.shape[0]
    out[:] = 1
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(BB * K * np.prod(z.shape) / threadsperblock)
    overlap_kernel_real2[blockspergrid, threadsperblock](r, z, out)
    return out
Example #34
def calc_psi_denom(r, t, out):
    """

    :param r: K x 2
    :param t: BB x NY x NX
    :param out: BB x MY x MX
    :return:
    """

    out[:] = 0
    K = r.shape[0]
    BB, MY, MX = out.shape
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(BB * K * MY * MX / threadsperblock)
    psi_denom_kernel[blockspergrid, threadsperblock](r, t, out)
    return out
Example #35
def cuda_all_euc_dists(coords_arr):
    """
    Pass an array of shape (models, side, dimensions), return all the
    euclidean distances between each coord in each model.
    """
    num_threads = cuda.get_current_device().WARP_SIZE
    models, side, _ = coords_arr.shape

    out_arr = np.zeros((side, side, models))

    # What block dims?
    tpb = (32, 16, 2)

    # Given threads per block, what should blocks per grid be?
    bpg = _grid_dim(out_arr, tpb)

    cuda_all_euc_dists_inner[bpg, tpb](coords_arr, out_arr)
    return (out_arr)
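The _grid_dim helper is not shown; presumably it ceil-divides each output dimension by the matching threads-per-block value, roughly:

import math

def _grid_dim(arr, tpb):
    # blocks per grid: one axis per output dimension, rounded up
    return tuple(int(math.ceil(d / t)) for d, t in zip(arr.shape, tpb))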
Example #36
def device_controller(cid):
    cuda.select_device(cid)                    # bind device to thread
    device = cuda.get_current_device()         # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.compute_capability)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:                        # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)                   # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)     # compute inplace
    d_data.copy_to_host(data)                       # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError
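The module-level signature and kernel referenced above are not part of this snippet. Given the data == orig + 1 check, they presumably amount to something like:

from numba import cuda, void, int32

signature = void(int32[:], int32[:])

def kernel(dst, src):
    # add 1 to every element; compiled per-thread via cuda.jit(signature)(kernel)
    i = cuda.grid(1)
    if i < dst.shape[0]:
        dst[i] = src[i] + 1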
Example #37
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = PRNG(PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
Example #38
from numba import cuda
import numpy as np
import cProfile

@cuda.jit
def increment_by_one(an_array):
    tx = cuda.threadIdx.x
    bidx = cuda.blockIdx.x
    bdim = cuda.blockDim.x
    idx = tx + bidx * bdim
    if idx < an_array.size:
        an_array[idx] += 1


dev = cuda.get_current_device()
print('Cuda device name = {}'.format(dev.name))

a = np.ones(1000000)
thread_per_block = 512
block_per_grid = int(np.ceil(a.size / thread_per_block))
increment_by_one[block_per_grid, thread_per_block](a)

profile = cProfile.Profile()
profile.enable()

d_a = cuda.to_device(a)
for i in range(10000):
    increment_by_one[block_per_grid, thread_per_block](d_a)
    # a += 1
d_a.copy_to_host(a)
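The profiler is enabled above but never reported. A possible closing step (assumed, not part of the original) that also flushes any pending GPU work before reading the numbers:

# copy_to_host() on the default stream already blocks, but an explicit
# synchronize makes the intent clear before stopping the profiler.
cuda.synchronize()
profile.disable()
profile.print_stats(sort='cumulative')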
Example #39
def main():
    # device = cuda.get_current_device()
    # maxtpb = device.MAX_THREADS_PER_BLOCK
    # warpsize = device.WARP_SIZE
    maxtpb = 512
    warpsize = 32

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8 where the GPU is used for graphic as well,
    # increasing the following to 10 * 2 ** 20 seems to be necessary to
    # produce consistent result.
    approx_data_size = 1.5 * 2**20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print('kernel config [%d, %d]' % (gridsz, blksz))

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print('data size %dMB' % (N / 2.**20 * A.dtype.itemsize))

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()
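ceil_to_nearest (along with time_this and the vec_add kernels) is defined elsewhere in the script; a plausible sketch of the rounding helper, assuming it rounds up to the nearest multiple:

import math

def ceil_to_nearest(x, multiple):
    # round x up to the nearest multiple of `multiple`
    return int(math.ceil(x / float(multiple))) * multiple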