Example #1
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None,
                          MAX_TPB = 512, stream = None):
    """
    All inputs (except MAX_TPB and stream) are device arrays.
    edges       : array with the IDs of the edges that will be part of the new graph
    n_edges     : array of 1 element with the number of valid edges in the edges array;
                  if n_edges < size of edges, the last elements of the edges array are
                  not considered
    """

    # check if number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received, if not create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream
    
    new_n_edges = edges_size * 2

    # allocate memory for new graph
    ndest = cuda.device_array(new_n_edges, dtype = dest.dtype,
                              stream = myStream)
    nweight = cuda.device_array(new_n_edges, dtype = weight.dtype,
                                stream = myStream)
    nfe = cuda.device_array_like(fe, stream = myStream)
    nod = cuda.device_array_like(od, stream = myStream)

    # fill new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count the edges of the new graph and which new vertices they belong to (new out-degree)
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get new first_edge array from new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = myStream)


    # copy new first_edge to top_edge to serve as pointer in adding edges
    top_edge = cuda.device_array_like(nfe, stream = myStream)
    top_edge.copy_to_device(nfe, stream = myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    #del dest, weight, fe, od
    return ndest, nweight, nfe, nod
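
The example calls several helpers that are not part of the snippet (compute_cuda_grid_dim, memSet, countEdges, addEdges, ex_prefix_sum_gpu). A minimal sketch of the two simplest ones, assuming compute_cuda_grid_dim just rounds the element count up to whole blocks and memSet fills a 1-D device array with a constant; these are guesses based on how they are called above, not the original implementations.

import math

from numba import cuda


def compute_cuda_grid_dim(n, max_tpb):
    # number of blocks needed to cover n elements with max_tpb threads per block
    return int(math.ceil(n / max_tpb))


@cuda.jit
def memSet(arr, value):
    # write `value` into every element of a 1-D device array
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] = value
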
Example #2
    def deserialize(cls, deserialize, header, frames):
        """Called when dask.distributed is performing a deserialization for
        data of this class.

        Do not use this directly.  It is invoked by dask.distributed.

        Parameters
        ----------

        deserialize : callable
            Used to deserialize data that needs further deserialization.
        header, frames : dict
            See custom serialization documentation in dask.distributed.

        Returns
        -------
        obj : Buffer
            Returns an instance of Buffer.
        """
        # Using IPC?
        if header['kind'] == 'ipc':
            ipch = deserialize(header['mem'], frames)
            # Open IPC handle
            with ipch as data:
                # Copy remote data over
                mem = cuda.device_array_like(data)
                mem.copy_to_device(data)
        # Not using IPC
        else:
            # Deserialize the numpy array
            mem = deserialize(header['mem'], frames)
            mem.flags['WRITEABLE'] = True  # XXX: hack for numba to work
        return Buffer(mem)
Example #3
def lombscargle(
    x,
    y,
    freqs,
    precenter=False,
    normalize=False,
):

    pgram = cuda.device_array_like(freqs)

    assert x.ndim == 1
    assert y.ndim == 1
    assert freqs.ndim == 1

    # Check input sizes
    if x.shape[0] != y.shape[0]:
        raise ValueError("Input arrays do not have the same size.")

    y_dot = cuda.device_array(shape=(1, ), dtype=y.dtype)
    if normalize:
        cp.dot(y, y, out=y_dot)

    if precenter:
        y_in = y - y.mean()
    else:
        y_in = y

    _lombscargle(x, y_in, freqs, pgram, y_dot)

    return pgram
Example #4
def few_copies():
    inp = np.arange(10000)
    d_inp = cuda.to_device(inp)
    d_out = cuda.device_array_like(inp)
    double[blocks, threadsPerBlock](d_inp, d_out)

    out = d_out.copy_to_host()
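
The `double` kernel and the `blocks` / `threadsPerBlock` launch configuration are defined elsewhere; a plausible definition, assuming a simple elementwise doubling kernel, is:

from numba import cuda

threadsPerBlock = 256
blocks = (10000 + threadsPerBlock - 1) // threadsPerBlock


@cuda.jit
def double(inp, out):
    # elementwise: out[i] = 2 * inp[i]
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = inp[i] * 2
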
Example #5
    def recall(self, x, how):
        if not isinstance(x, np.ndarray):
            x = np.array(x)

        assert np.size(x) == self._x_size

        orig_shape = x.shape
        x = x.reshape((self._x_size, 1))
        out = np.empty_like(x)

        if how == 'm':
            mem = self._dev_mem_m
            op = mnn.mat_morph_mul_min_plus
        elif how == 'w':
            mem = self._dev_mem_w
            op = mnn.mat_morph_mul_max_plus
        else:
            raise ValueError('Expected how=\'m\' or how=\'w\'')

        stream = cuda.stream()

        dev_x = cuda.to_device(x, stream=stream)
        out_x = cuda.device_array_like(out, stream=stream)

        op(mem, dev_x, out_x, stream=stream)
        out_x.copy_to_host(out, stream=stream)

        stream.synchronize()

        return out.T.reshape(orig_shape)
Example #6
def _request_transfer(key, remoteinfo):
    logger.info("rebuild from: %s for %r", remoteinfo, key)

    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://{0}:{1}".format(*remoteinfo))

    myaddr = _global_addr[0]
    theiraddr = remoteinfo[0]
    if myaddr == theiraddr:
        # Same machine: transfer via IPC
        logger.info("request by IPC")
        socket.send(pickle.dumps(('IPC', key)))
        rcv = socket.recv()
        ipch = pickle.loads(rcv)
        # Open IPC and copy to local context

        with ipch as data:
            copied = cuda.device_array_like(data)
            copied.copy_to_device(data)

        # Release
        _request_drop(socket, key)
        return copied
    else:
        # Different machine: transfer over the network
        logger.info("request by NET: %s->%s", theiraddr, myaddr)
        socket.send(pickle.dumps(('NET', key)))
        rcv = socket.recv()
        output = cuda.to_device(pickle.loads(rcv))
        # Release
        _request_drop(socket, key)
        return output
Example #7
    def _test_shared(self, arr):
        # Use a kernel that copies via shared memory to check loading and
        # storing different dtypes with shared memory. All threads in a block
        # collaborate to load in values, then the output values are written
        # only by the first thread in the block after synchronization.

        nelem = len(arr)
        nthreads = 16
        nblocks = int(nelem / nthreads)
        dt = nps.from_dtype(arr.dtype)

        @cuda.jit
        def use_sm_chunk_copy(x, y):
            sm = cuda.shared.array(nthreads, dtype=dt)

            tx = cuda.threadIdx.x
            bx = cuda.blockIdx.x
            bd = cuda.blockDim.x

            # Load this block's chunk into shared
            i = bx * bd + tx
            if i < len(x):
                sm[tx] = x[i]

            cuda.syncthreads()

            # One thread per block writes this block's chunk
            if tx == 0:
                for j in range(nthreads):
                    y[bd * bx + j] = sm[j]

        d_result = cuda.device_array_like(arr)
        use_sm_chunk_copy[nblocks, nthreads](arr, d_result)
        host_result = d_result.copy_to_host()
        np.testing.assert_array_equal(arr, host_result)
Example #8
 def __init__(self, r_cut, r_buff=0.5, cell_guess=50, n_guess=150):
     system = Ctx.get_active()
     if system is None:
         raise ValueError("No active system!")
     self.system = system
     self.cell_guess = cell_guess
     self.n_guess = n_guess
     self.r_cut2 = r_cut ** 2
     self.r_buff2 = (r_buff / 2) ** 2
     self.gpu = system.gpu
     self.tpb = 64
     self.bpg = int(self.system.N // self.tpb + 1)
     # self.situ_zero = np.zeros(1, dtype=np.int32)
     self.update_counts = 0
     self.dist_funcs = {}
     self.cu_nlist, self.cu_check_build = self._gen_func()
     with cuda.gpus[self.gpu]:
         self.p_n_max = cuda.pinned_array((1,), dtype=np.int32)
         self.p_situation = cuda.pinned_array((1,), dtype=np.int32)
         self.d_last_x = cuda.device_array_like(self.system.d_x)
         self.d_n_max = cuda.device_array(1, dtype=np.int32)
         self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32)
         self.d_nc = cuda.device_array((self.system.N,), dtype=np.int32)
         self.d_situation = cuda.device_array(1, dtype=np.int32)
     self.clist = clist(r_cut, r_buff, cell_guess=self.cell_guess)
     self.neighbour_list()
     self.system.nlist = self  # register to system
Example #9
def find_last(arr, val, compare="eq"):
    """
    Returns the index of the last occurrence of *val* in *arr*, or, if
    *compare* is not 'eq', the index of the last element of *arr* for which
    ``arr[i] <compare> val`` holds. Returns -1 if there is no match.

    Parameters
    ----------
    arr : device array
    val : scalar
    compare: str ('gt', 'lt', or 'eq' (default))
    """
    found = cuda.device_array_like(arr)
    if found.size > 0:
        if compare == "gt":
            gpu_mark_gt.forall(found.size)(arr, val, found, -1)
        elif compare == "lt":
            gpu_mark_lt.forall(found.size)(arr, val, found, -1)
        else:
            if arr.dtype in ("float32", "float64"):
                gpu_mark_found_float.forall(found.size)(arr, val, found, -1)
            else:
                gpu_mark_found_int.forall(found.size)(arr, val, found, -1)
    from cudf.core.column import as_column

    found_col = as_column(found)
    max_index = found_col.max()
    return max_index
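
The gpu_mark_* kernels are not shown. The idea is to write each matching index into `found` and the sentinel value (-1 here) everywhere else, so that a column maximum yields the last matching position. A guess at one of them:

from numba import cuda


@cuda.jit
def gpu_mark_found_int(arr, val, out, not_found):
    i = cuda.grid(1)
    if i < arr.size:
        # keep the index where the value matches, the sentinel otherwise
        out[i] = i if arr[i] == val else not_found
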
Example #10
 def __init__(self, r_cut, r_buff=0.5, cell_guess=50):
     system = Ctx.get_active()
     if system is None:
         raise ValueError("Error, Initialize system first!")
     self.system = system
     self.ibox = np.asarray(np.floor(system.box / (r_cut + r_buff)),
                            dtype=np.int32)
     self.n_cell = int(np.multiply.reduce(self.ibox))
     self.cell_adj = np.ones(self.system.n_dim, dtype=np.int32) * 3
     self.gpu = system.gpu
     self.tpb = 64
     self.bpg = int(self.system.N // self.tpb + 1)
     self.bpg_cell = int(self.n_cell // self.tpb + 1)
     self.cell_guess = cell_guess
     # self.situ_zero = np.zeros(1, dtype=np.int32)
     global cu_cell_index, cu_cell_map, cu_cell_list
     cu_cell_index, cu_cell_map, cu_cell_list = _gen_func(
         system.dtype, system.n_dim)
     self.p_cell_max = cuda.pinned_array((1, ), dtype=np.int32)
     with cuda.gpus[self.gpu]:
         self.d_last_x = cuda.device_array_like(self.system.d_x)
         self.d_cells = cuda.device_array(self.system.d_x.shape[0],
                                          dtype=np.int32)
         self.d_cell_map = cuda.device_array((self.n_cell, 3**system.n_dim),
                                             dtype=np.int32)
         self.d_ibox = cuda.to_device(self.ibox)
         self.d_cell_adj = cuda.to_device(self.cell_adj)
         cu_cell_map[self.bpg_cell, self.tpb](self.d_ibox, self.d_cell_adj,
                                              self.d_cell_map)
         self.d_cell_list = cuda.device_array(
             (self.n_cell, self.cell_guess, self.system.n_dim + 1),
             dtype=self.system.dtype)
         self.d_cell_counts = cuda.device_array(self.n_cell, dtype=np.int32)
         self.d_cell_max = cuda.device_array(1, dtype=np.int32)
     self.update()
Example #11
def test_1d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    print("\ntesting 1d")
    t = time.time()
    arr1, arr2 = np.random.randn(2, data_len).astype(dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))

    print("\ntesting multiplication times")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)),
                      number=num_times)
    print("cuda_mult_1d time = {:.3e}".format(max(t) / num_times))

    #     t = time.time()
    #     mult_gpu_1d[blocksize, 32](d_arr1, d_arr2, d_result)
    #     cuda.synchronize()
    #     print("cuda_mult_1d time = {:.3e}".format(time.time() - t))

    print("\ntesting sum times")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)),
                      number=num_times)
    print("cuda_add_1d time = {:.3e}".format(max(t) / num_times))

    #     t = time.time()
    #     add_gpu_1d[blocksize, 32](d_arr1, d_arr2, d_result)
    #     print("cuda_add_1d time = {:.3e}".format(time.time() - t))

    print("\ntesting reduction times")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = {:.3e}".format(max(t) / num_times))

    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = {:.3e}".format(max(t) / num_times))
Example #12
def scale(in_arr1, scaler):
    out_arr = cuda.device_array_like(in_arr1)
    array_len = len(in_arr1)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    scale_kernel[(number_of_blocks, ),
                 (number_of_threads, )](in_arr1, scaler, out_arr, array_len)
    return out_arr
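
`scale_kernel` and the module-level `number_of_threads` are assumed by this snippet; the neighbouring examples follow the same launch pattern with different elementwise kernels. A guess at what the missing pieces look like for a simple elementwise scaling:

from numba import cuda

number_of_threads = 256


@cuda.jit
def scale_kernel(in_arr, scaler, out_arr, arr_len):
    i = cuda.grid(1)
    if i < arr_len:
        out_arr[i] = in_arr[i] * scaler
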
Example #13
def compute_stats(arr):
    """
    Returns (mean, variance)
    """
    mu = compute_mean(arr)
    tmp = cuda.device_array_like(arr)
    gpu_variance_step.forall(arr.size)(arr, mu, tmp)
    return mu, compute_mean(tmp)
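
`compute_mean` and `gpu_variance_step` are assumed; a sketch of a plausible pair, with a GPU sum reduction for the mean and an elementwise squared-deviation kernel:

from numba import cuda


@cuda.reduce
def gpu_sum(a, b):
    return a + b


def compute_mean(arr):
    return gpu_sum(arr) / arr.size


@cuda.jit
def gpu_variance_step(arr, mu, out):
    i = cuda.grid(1)
    if i < arr.size:
        out[i] = (arr[i] - mu) ** 2
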
Example #14
def abs_arr(in_arr):
    out_arr = cuda.device_array_like(in_arr)
    array_len = len(in_arr)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    abs_kernel[(number_of_blocks, ), (number_of_threads, )](in_arr, out_arr,
                                                            array_len)
    return out_arr
Example #15
def division(in_arr1, in_arr2):
    out_arr = cuda.device_array_like(in_arr1)
    array_len = len(in_arr1)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    binary_div[(number_of_blocks, ), (number_of_threads, )](in_arr1, in_arr2,
                                                            out_arr, array_len)
    return out_arr
Example #16
def port_money_flow(asset_ind, pp_arr, volume_arr):
    out_arr = cuda.device_array_like(pp_arr)
    array_len = len(pp_arr)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    port_moneyflow_kernel[(number_of_blocks, ),
                          (number_of_threads, )](asset_ind, pp_arr, volume_arr,
                                                 out_arr, array_len)
    return out_arr
Example #17
def onbalance_volume(close_arr, volume_arr):
    out_arr = cuda.device_array_like(close_arr)
    array_len = len(close_arr)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    onbalance_kernel[(number_of_blocks, ),
                     (number_of_threads, )](close_arr, volume_arr, out_arr,
                                            array_len)
    return out_arr
Example #18
def main():
    inp = np.arange(10_000_000)
    d_out = cuda.device_array_like(inp)
    double[blocks, threadsPerBlock](cuda.to_device(inp), d_out)

    s = sum_reducer(d_out)
    exp = (len(inp) - 1) * len(inp)

    assert s == exp
Example #19
def average_price(high_arr, low_arr, close_arr):
    out_arr = cuda.device_array_like(high_arr)
    array_len = len(high_arr)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    average_price_kernel[(number_of_blocks, ),
                         (number_of_threads, )](high_arr, low_arr, close_arr,
                                                out_arr, array_len)
    return out_arr
Example #20
def copy_array(arr, out=None):
    if out is None:
        out = cuda.device_array_like(arr)
    assert out.size == arr.size
    if arr.is_c_contiguous() and out.is_c_contiguous():
        out.copy_to_device(arr)
    else:
        gpu_copy.forall(out.size)(arr, out)
    return out
Example #22
def lowhigh_diff(high_arr, low_arr):
    out_arr = cuda.device_array_like(high_arr)
    array_len = len(high_arr)
    number_of_blocks = \
        (array_len + (number_of_threads - 1)) // number_of_threads
    lowhigh_diff_kernel[(number_of_blocks, ),
                        (number_of_threads, )](high_arr, low_arr, out_arr,
                                               array_len)
    return out_arr
Example #23
def port_true_range(asset_indicator, high_arr, low_arr, close_arr):
    out_arr = cuda.device_array_like(high_arr)
    array_len = len(high_arr)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    port_true_range_kernel[(number_of_blocks, ),
                           (number_of_threads, )](asset_indicator, high_arr,
                                                  low_arr, close_arr, out_arr,
                                                  array_len)
    return out_arr
Example #24
def compute_signal(signal):
    signal_arr = signal.data.to_gpu_array()
    out_arr = cuda.device_array_like(signal_arr)
    number_of_threads = 256
    array_len = len(signal)
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    signal_kernel[(number_of_blocks, ),
                  (number_of_threads, )](signal_arr, out_arr, array_len)
    return out_arr
Example #25
def recode(data, recode_table, na_value):
    """Recode data with the given recode table.
    And setting out-of-range values to *na_value*
    """
    newdata = cuda.device_array_like(data)
    recode_table = to_device(recode_table)
    blksz = 32 * 4
    blkct = min(16, max(1, data.size // blksz))
    gpu_recode[blkct, blksz](newdata, data, recode_table, na_value)
    return newdata
Example #26
 def test_get_ipc_handle(self):
     # We don't attempt to close the IPC handle in this test because Numba
     # will be expecting a real IpcHandle object to have been returned from
     # get_ipc_handle, and it would cause problems to do so.
     arr = np.arange(2)
     d_arr = cuda.device_array_like(arr)
     ipch = d_arr.get_ipc_handle()
     ctx = cuda.current_context()
     self.assertTrue(ctx.memory_manager.get_ipc_handle_called)
     self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle)
Example #27
def testProfile():
    finalarr = []
    t_start = timeit.default_timer()
    for i in range(k_num):
        mat = arr[i].getMat()
        sums = numpy.zeros(len(mat), dtype=numpy.float64)
        d_rvecs = cuda.to_device(mat)
        d_sums = cuda.device_array_like(sums)
        # one block per row; vec_sum_row reduces each row of d_rvecs into d_sums
        vec_sum_row[len(mat), threadsperblock](d_rvecs, d_sums)
        finalarr.extend(d_sums.copy_to_host(sums))
    t_end = timeit.default_timer()
    print('gpu took ' + str(t_end - t_start) + ' seconds')
    return finalarr
Example #28
 def _test_device_array_like_same(self, d_a):
     """
     Tests of device_array_like where shape, strides, dtype, and flags should
     all be equal.
     """
     d_a_like = cuda.device_array_like(d_a)
     self.assertEqual(d_a.shape, d_a_like.shape)
     self.assertEqual(d_a.strides, d_a_like.strides)
     self.assertEqual(d_a.dtype, d_a_like.dtype)
     self.assertEqual(d_a.flags['C_CONTIGUOUS'], d_a_like.flags['C_CONTIGUOUS'])
     self.assertEqual(d_a.flags['F_CONTIGUOUS'], d_a_like.flags['F_CONTIGUOUS'])
Example #29
def add(a, b):
    a_device = cuda.to_device(a)
    b_device = cuda.to_device(b)
    gpu_result = cuda.device_array_like(a)
    threads_per_block = (16, 16)
    blocks_per_grid_x = int(math.ceil(a.shape[0] / threads_per_block[0]))
    blocks_per_grid_y = int(math.ceil(b.shape[1] / threads_per_block[1]))
    blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)
    _add[blocks_per_grid, threads_per_block](a_device, b_device, gpu_result)
    result = gpu_result.copy_to_host()
    return result
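
The `_add` kernel is not included; given the 2-D launch configuration, it is presumably an elementwise matrix addition along the lines of:

from numba import cuda


@cuda.jit
def _add(a, b, out):
    i, j = cuda.grid(2)
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = a[i, j] + b[i, j]
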
Example #30
    def test_transpose_bool(self):
        for rows, cols in self.small_variants:
            with self.subTest(rows=rows, cols=cols):
                arr = np.random.randint(2, size=(rows, cols), dtype=np.bool_)
                transposed = arr.T

                d_arr = cuda.to_device(arr)
                d_transposed = cuda.device_array_like(transposed)
                transpose(d_arr, d_transposed)

                host_transposed = d_transposed.copy_to_host()
                np.testing.assert_array_equal(transposed, host_transposed)
Example #31
def moving_average_signal(stock_df, n_fast, n_slow):
    ma_slow = ci.moving_average(stock_df['close'], n_slow).to_gpu_array()
    ma_fast = ci.moving_average(stock_df['close'], n_fast).to_gpu_array()
    out_arr = cuda.device_array_like(ma_fast)
    array_len = len(ma_slow)
    number_of_threads = 256
    number_of_blocks = (array_len +
                        (number_of_threads - 1)) // number_of_threads
    moving_average_signal_kernel[(number_of_blocks, ),
                                 (number_of_threads, )](ma_fast, ma_slow,
                                                        out_arr, array_len)
    return out_arr, ma_slow, ma_fast
Example #32
 def Compute(self, ClassImage_GPU):
     self.ThreadBlockRelax()
     RelaxImage_GPU = cuda.device_array_like(ClassImage_GPU)
     CompatMatrix_GPU = cuda.to_device(self.CompatMatrix)
     class_dimGPU = cuda.to_device(np.array(classImg_dim, dtype=np.int16))
     self.relaxLabel[self.BlocksPerGridRelax,
                     self.ThreadsPerBlockRelax](RelaxImage_GPU,
                                                ClassImage_GPU,
                                                CompatMatrix_GPU,
                                                class_dimGPU)
     self.GPURelax = RelaxImage_GPU.copy_to_host()
     return self.GPURelax, RelaxImage_GPU
Example #33
 def run(self, arr, k):
     if k >= MAX_FAST_UNIQUE_K:
         raise NotImplementedError('k >= {}'.format(MAX_FAST_UNIQUE_K))
     # setup mem
     outsz_ptr = cuda.device_array(shape=1, dtype=np.intp)
     out = cuda.device_array_like(arr)
     # kernel
     self._kernel[1, 64](arr, k, out, outsz_ptr)
     # copy to host
     unique_ct = outsz_ptr.copy_to_host()[0]
     if unique_ct < 0:
         raise ValueError('too many unique value (hint: increase k)')
     else:
         hout = out.copy_to_host()
         return hout[:unique_ct]
Example #34
def task2():
    a = numpy.float32(2.0)  # Force value to be float32
    x = numpy.arange(NELEM, dtype="float32")
    y = numpy.arange(NELEM, dtype="float32")

    ### Task2 ###
    # a) Complete the memory transfer for x -> dx, y -> dy
    # b) Allocate device memory for dout
    # c) Transfer for out <- dout
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dout = cuda.device_array_like(x)

    griddim = NUM_BLOCKS
    blockdim = NUM_THREADS
    saxpy[griddim, blockdim](a, dx, dy, dout)

    out = dout.copy_to_host()
    print("out =", out)

    if numpy.allclose(a * x + y, out):
        print("Correct result")
    else:
        print("Incorrect result")
Example #35
aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32)

print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20)))

evt_total_begin = cuda.event()
evt_total_end = cuda.event()

evt_kernel_begin = cuda.event()
evt_kernel_end = cuda.event()

t_total_begin = timer()
evt_total_begin.record()

# explicitly transfer memory to the device
d_aryA = cuda.to_device(aryA)
d_aryB = cuda.device_array_like(aryA)

evt_kernel_begin.record()

t_kernel_begin = timer()
cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA)
t_kernel_end = timer()

evt_kernel_end.record()

aryB = d_aryB.copy_to_host()

evt_total_end.record()

evt_total_end.synchronize()
t_total_end = timer()
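
`cu_copy_array` and the BLOCKSIZE / BLOCKCOUNT constants come from elsewhere in the script; a plausible sketch, plus how the recorded events could be read back as millisecond timings (an assumption, using numba's cuda.event_elapsed_time):

from numba import cuda

BLOCKSIZE = 256    # assumed launch constants
BLOCKCOUNT = 4096


@cuda.jit
def cu_copy_array(dst, src):
    # elementwise device-to-device copy
    i = cuda.grid(1)
    if i < dst.size:
        dst[i] = src[i]


# after evt_total_end.synchronize(), the event pairs give GPU-side timings:
# kernel_ms = cuda.event_elapsed_time(evt_kernel_begin, evt_kernel_end)
# total_ms = cuda.event_elapsed_time(evt_total_begin, evt_total_end)
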
Example #36
def fillna(data, mask, value):
    out = cuda.device_array_like(data)
    out.copy_to_device(data)
    configured = gpu_fill_masked.forall(data.size)
    configured(value, mask, out)
    return out
Example #37
def connected_comps_gpu(dest_in, weight_in, firstEdge_in, outDegree_in,
                        MAX_TPB = 512, stream = None):
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    dest = cuda.to_device(dest_in, stream = myStream)
    weight = cuda.to_device(weight_in, stream = myStream)
    firstEdge = cuda.to_device(firstEdge_in, stream = myStream)
    outDegree = cuda.to_device(outDegree_in, stream = myStream)

    n_vertices = firstEdge.size
    n_edges = dest.size

    n_components = n_vertices

    # still need edge_id for conflict resolution in find_minedge
    edge_id = cuda.to_device(np.arange(n_edges, dtype = dest.dtype),
                                       stream = myStream)
    
    #labels = np.empty(n_vertices, dtype = dest.dtype)
    first_iter = True

    # initialize with name top_edge so we can recycle an array between iterations
    top_edge = cuda.device_array(n_components, dtype = dest.dtype,
                                 stream = myStream)
    labels = cuda.device_array(n_components, dtype = dest.dtype,
                               stream = myStream)

    converged = cuda.device_array(1, dtype = np.int8, stream = myStream)
    
    gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB)
    gridDim = compute_cuda_grid_dim(n_components, MAX_TPB)

    final_converged = False
    while(not final_converged):
        vertex_minedge = top_edge

        findMinEdge_CUDA[gridDim, MAX_TPB, myStream](weight, firstEdge,
                                                     outDegree, vertex_minedge,
                                                     dest)

        removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge)

        colors = cuda.device_array(shape = n_components, dtype = np.int32,
                                   stream = myStream)
        initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge,
                                                          colors)

        # propagate colors until convergence
        propagateConverged = False
        while(not propagateConverged):
            propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged)
            converged_num = converged.getitem(0, stream = myStream)
            propagateConverged = True if converged_num == 1 else False

        # first we build the flags in the new_vertex array
        new_vertex = vertex_minedge # reuse the vertex_minedge array as the new new_vertex
        buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex)

        # new_n_vertices is the number of vertices of the new contracted graph
        new_n_vertices = ex_prefix_sum_gpu(new_vertex, MAX_TPB = MAX_TPB, stream = myStream).getitem(0, stream = myStream)
        new_n_vertices = int(new_n_vertices)

        if first_iter:
            # first iteration defines labels as the initial colors and updates
            labels.copy_to_device(colors, stream = myStream)
            first_iter = False
        
        # other iterations update the labels with the new colors
        update_labels_single_pass_cuda[gridDimLabels, MAX_TPB, myStream](labels, colors, new_vertex)

        if new_n_vertices == 1:
            final_converged = True
            del new_vertex
            break

        newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB)
        
        # count number of edges for new supervertices and write in new outDegree
        newOutDegree = cuda.device_array(shape = new_n_vertices,
                                         dtype = np.int32,
                                         stream = myStream)
        memSet[newGridDim, MAX_TPB, myStream](newOutDegree, 0) # zero the newOutDegree array
        countNewEdges_CUDA[gridDim, MAX_TPB, myStream](colors, firstEdge,
                                                       outDegree, dest,
                                                       new_vertex, newOutDegree)

        # new first edge array for contracted graph
        newFirstEdge = cuda.device_array_like(newOutDegree, stream = myStream)
        # copy newOutDegree to newFirstEdge
        newFirstEdge.copy_to_device(newOutDegree, stream = myStream)
        new_n_edges = ex_prefix_sum_gpu(newFirstEdge, MAX_TPB = MAX_TPB,
                                        stream = myStream)

        new_n_edges = new_n_edges.getitem(0, stream = myStream)
        new_n_edges = int(new_n_edges)

        # if no edges remain, then MST has converged
        if new_n_edges == 0:
            final_converged = True
            del newOutDegree, newFirstEdge, new_vertex
            break

        # create arrays for new edges
        new_dest = cuda.device_array(new_n_edges, dtype = np.int32,
                                     stream = myStream)
        new_edge_id = cuda.device_array(new_n_edges, dtype = np.int32,
                                        stream = myStream)
        new_weight = cuda.device_array(new_n_edges, dtype = weight.dtype,
                                       stream = myStream)

        top_edge = cuda.device_array_like(newFirstEdge, stream = myStream)
        top_edge.copy_to_device(newFirstEdge, stream = myStream)

        # assign and insert new edges
        assignInsert_CUDA[gridDim, MAX_TPB, myStream](edge_id, dest, weight,
                                            firstEdge, outDegree, colors,
                                            new_vertex, new_dest, new_edge_id,
                                            new_weight, top_edge)

        # delete old graph
        del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors

        # write new graph
        n_components = newFirstEdge.size
        edge_id = new_edge_id
        dest = new_dest
        weight = new_weight
        firstEdge = newFirstEdge
        outDegree = newOutDegree
        gridDim = newGridDim

    returnLabels = labels.copy_to_host()

    del dest, weight, edge_id, firstEdge, outDegree, converged, labels

    return returnLabels
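
A hypothetical usage sketch: a small undirected graph in the CSR-like (dest, weight, firstEdge, outDegree) layout the function expects, with every edge stored once per endpoint. The values below are illustrative only and not taken from the original code.

import numpy as np

# path graph 0-1-2-3; each undirected edge appears in both directions
dest      = np.array([1, 0, 2, 1, 3, 2], dtype=np.int32)
weight    = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0], dtype=np.float32)
firstEdge = np.array([0, 1, 3, 5], dtype=np.int32)   # offset of each vertex's edge list
outDegree = np.array([1, 2, 2, 1], dtype=np.int32)   # number of edges per vertex

labels = connected_comps_gpu(dest, weight, firstEdge, outDegree)
# the graph is connected, so all four labels should end up equal
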
Example #38
def main():
    # device = cuda.get_current_device()
    # maxtpb = device.MAX_THREADS_PER_BLOCK
    # warpsize = device.WARP_SIZE
    maxtpb = 512
    warpsize = 32

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8, where the GPU is also used for graphics, increasing the
    # following to 10 * 2 ** 20 seems to be necessary to produce consistent
    # results.
    approx_data_size = 1.5 * 2**20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print('kernel config [%d, %d]' % (gridsz, blksz))

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print('data size %dMB' % (N / 2.**20 * A.dtype.itemsize))

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()
Example #39
from timeit import default_timer as timer
import numpy as np
from numba import vectorize, float32, cuda

src = np.arange(10 ** 7, dtype=np.float32)
dst = np.empty_like(src)

@vectorize([float32(float32)], target='cuda')
def copy_kernel(src):
    return src

# Regular memory transfer

ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()

print('regular', te - ts)

del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (pagelocked) memory transfer

with cuda.pinned(src, dst):