def data_parallel_sum(a, b, c):
     """
     Vector addition using the ``kernel`` decorator.
     """
     i = dppy.get_global_id(0)
     j = dppy.get_global_id(1)
     c[i, j] = a[i, j] + b[i, j]
Example #2
def data_parallel_sum(a, b, c):
    """
    A two-dimensional vector addition example using the ``kernel`` decorator.
    """
    i = dppy.get_global_id(0)
    j = dppy.get_global_id(1)
    c[i, j] = a[i, j] + b[i, j]
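The bodies above only define the device function; to run one, it has to be compiled with the numba-dppy ``kernel`` decorator and launched over an ND-range from host code. A minimal host-side sketch, assuming the kernel above is decorated with ``@dppy.kernel`` and using the ``kernel[global_size, local_size]`` launch syntax; the device string and array sizes are illustrative assumptions:

import numpy as np
import dpctl
import numba_dppy as dppy

# Host-side driver for the two-dimensional addition kernel above.
X, Y = 64, 64
a = np.ones((X, Y), dtype=np.float32)
b = np.ones((X, Y), dtype=np.float32)
c = np.zeros_like(a)

with dpctl.device_context("opencl:gpu"):
    # One work-item per output element; the runtime picks the local size.
    data_parallel_sum[(X, Y), dppy.DEFAULT_LOCAL_SIZE](a, b, c)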
Example #3
def dppy_gemm(a, b, c):
    i = numba_dppy.get_global_id(0)
    j = numba_dppy.get_global_id(1)
    if i >= c.shape[0] or j >= c.shape[1]:
        return
    c[i, j] = 0
    # iterate over the contraction dimension (the original c.shape[0] bound only works for square matrices)
    for k in range(a.shape[1]):
        c[i, j] += a[i, k] * b[k, j]
Example #4
def dppy_gemm(a, b, c):
    """
    A basic DGEMM implemented as a ``kernel`` function.
    """
    i = dppy.get_global_id(0)
    j = dppy.get_global_id(1)
    if i >= c.shape[0] or j >= c.shape[1]:
        return
    c[i, j] = 0
    # iterate over the contraction dimension (the original c.shape[0] bound only works for square matrices)
    for k in range(a.shape[1]):
        c[i, j] += a[i, k] * b[k, j]
Example #5
def sum_reduction_kernel(A, input_size, partial_sums):
    local_id = dppy.get_local_id(0)
    global_id = dppy.get_global_id(0)
    group_size = dppy.get_local_size(0)
    group_id = dppy.get_group_id(0)

    local_sums = dppy.local.array(64, int32)

    local_sums[local_id] = 0

    if global_id < input_size:
        local_sums[local_id] = A[global_id]

    # Tree reduction over local_sums: halve the active part of the work-group each iteration
    stride = group_size // 2
    while stride > 0:
        # Wait until every work-item has finished the additions of the previous step
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        # Add elements pairwise: local_id accumulates local_id + stride
        if local_id < stride:
            local_sums[local_id] += local_sums[local_id + stride]

        stride >>= 1

    if local_id == 0:
        partial_sums[group_id] = local_sums[0]
Example #6
def sum_reduction_kernel(A, partial_sums):
    """
    The example demonstrates a reduction kernel implemented as a ``kernel``
    function.
    """
    local_id = dppy.get_local_id(0)
    global_id = dppy.get_global_id(0)
    group_size = dppy.get_local_size(0)
    group_id = dppy.get_group_id(0)

    local_sums = dppy.local.array(64, int32)

    # Copy from global to local memory
    local_sums[local_id] = A[global_id]

    # Tree reduction over local_sums: halve the active part of the work-group each iteration
    stride = group_size // 2
    while stride > 0:
        # Wait until every work-item has finished the additions of the previous step
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        # Add elements pairwise: local_id accumulates local_id + stride
        if local_id < stride:
            local_sums[local_id] += local_sums[local_id + stride]

        stride >>= 1

    if local_id == 0:
        partial_sums[group_id] = local_sums[0]
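The reduction kernel writes one value per work-group into ``partial_sums``, so a second step is still needed to combine the partial results. A minimal host-side sketch, assuming the kernel above is decorated with ``@dppy.kernel`` and launched with a work-group size of 64 to match its ``dppy.local.array(64, int32)`` declaration; the device string and sizes are assumptions:

import numpy as np
import dpctl
import numba_dppy as dppy

global_size = 1024
work_group_size = 64
nb_work_groups = global_size // work_group_size

A = np.arange(global_size, dtype=np.int32)
partial_sums = np.zeros(nb_work_groups, dtype=np.int32)

with dpctl.device_context("opencl:gpu"):
    sum_reduction_kernel[global_size, work_group_size](A, partial_sums)

# Each work-group contributed one partial sum; finish the reduction on the host.
total = partial_sums.sum()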
Example #7
def black_scholes(nopt, price, strike, t, rate, vol, call, put):
    mr = -rate
    sig_sig_two = vol * vol * 2

    i = numba_dppy.get_global_id(0)

    P = price[i]
    S = strike[i]
    T = t[i]

    a = log(P / S)
    b = T * mr

    z = T * sig_sig_two
    c = 0.25 * z
    y = 1.0 / sqrt(z)

    w1 = (a - b + c) * y
    w2 = (a - b - c) * y

    d1 = 0.5 + 0.5 * erf(w1)
    d2 = 0.5 + 0.5 * erf(w2)

    Se = exp(b) * S

    r = P * d1 - Se * d2
    call[i] = r
    put[i] = r - P + Se
Example #8
def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
    """
    A simple implementation of the Black-Scholes formula using the explicit
    OpenCL-style kernel programming model.
    """
    i = dppy.get_global_id(0)
    if i >= S.shape[0]:
        return
    sqrtT = math.sqrt(T[i])
    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
    d2 = d1 - V * sqrtT

    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
    cndd1 = (
        RSQRT2PI
        * math.exp(-0.5 * d1 * d1)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d1 > 0:
        cndd1 = 1.0 - cndd1

    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
    cndd2 = (
        RSQRT2PI
        * math.exp(-0.5 * d2 * d2)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d2 > 0:
        cndd2 = 1.0 - cndd2

    expRT = math.exp((-1.0 * R) * T[i])
    callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
    putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
Example #9
def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
    i = dppy.get_global_id(0)
    if i >= S.shape[0]:
        return
    sqrtT = math.sqrt(T[i])
    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
    d2 = d1 - V * sqrtT

    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
    cndd1 = (
        RSQRT2PI
        * math.exp(-0.5 * d1 * d1)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d1 > 0:
        cndd1 = 1.0 - cndd1

    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
    cndd2 = (
        RSQRT2PI
        * math.exp(-0.5 * d2 * d2)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d2 > 0:
        cndd2 = 1.0 - cndd2

    expRT = math.exp((-1.0 * R) * T[i])
    callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
    putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
Example #10
def reverse_array(A):
    lm = dppy.local.array(shape=10, dtype=np.float32)
    i = dppy.get_global_id(0)

    # preload into local memory
    lm[i] = A[i]
    # a local or a global barrier both work here because there is only one work-group
    dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
    # write back: add the mirrored element (blocksize is captured from the enclosing scope)
    A[i] += lm[blocksize - 1 - i]
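The kernel refers to a free variable ``blocksize`` that is not among its parameters, and its barrier comment relies on there being a single work-group. A minimal sketch of the enclosing host code under those assumptions (the device string is also an assumption), with the kernel above taken to be decorated with ``@dppy.kernel``:

import numpy as np
import dpctl
import numba_dppy as dppy

blocksize = 10  # closed over by the kernel; matches the local array size

arr = np.arange(blocksize).astype(np.float32)
with dpctl.device_context("opencl:gpu"):
    # A single work-group of `blocksize` work-items, so the local barrier
    # synchronises every work-item that reads and writes the array.
    reverse_array[blocksize, blocksize](arr)
# arr[i] now holds the original arr[i] + arr[blocksize - 1 - i].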
Example #11
def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
    idx = numba_dppy.get_global_id(0)
    minor_distance = -1
    for i in range(num_centroids):
        dx = arrayP[idx, 0] - arrayC[i, 0]
        dy = arrayP[idx, 1] - arrayC[i, 1]
        my_distance = numpy.sqrt(dx * dx + dy * dy)
        if minor_distance > my_distance or minor_distance == -1:
            minor_distance = my_distance
            arrayPcluster[idx] = i
Example #12
def private_memory_kernel(A):
    memory = numba_dppy.private.array(shape=1, dtype=np.float32)
    i = numba_dppy.get_global_id(0)

    # preload
    memory[0] = i
    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence

    # memory will not hold a correct, deterministic result if it is not
    # private to each work-item.
    A[i] = memory[0] * 2
Example #13
def pairwise_python(X1, X2, D):
    i = numba_dppy.get_global_id(0)

    N = X2.shape[0]
    O = X1.shape[1]
    for j in range(N):
        d = 0.0
        for k in range(O):
            tmp = X1[i, k] - X2[j, k]
            d += tmp * tmp
        D[i, j] = np.sqrt(d)
Example #14
def pairwise_distance(X, D, xshape0, xshape1):
    """
    A Euclidean pairwise distance computation implemented as
    a ``kernel`` function.
    """
    idx = dppy.get_global_id(0)

    # for i in range(xshape0):
    for j in range(X.shape[0]):
        d = 0.0
        for k in range(X.shape[1]):
            tmp = X[idx, k] - X[j, k]
            d += tmp * tmp
        D[idx, j] = sqrt(d)
Example #15
def sum_kernel(a, b, c):
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]
Example #16
File: dppy-basic.py  Project: intel/gdb
def data_parallel_sum(a, b, c):
    i = dppy.get_global_id(0)   # numba-kernel-breakpoint
    l1 = a[i]                   # second-line
    l2 = b[i]                   # third-line
    c[i] = l1 + l2              # fourth-line
Example #17
def h(a, b):
    i = dppy.get_global_id(0)
    b[i] = g(a[i]) + 1
Example #18
def copy_arrayC(arrayC, arrayP):
    i = numba_dppy.get_global_id(0)
    arrayC[i, 0] = arrayP[i, 0]
    arrayC[i, 1] = arrayP[i, 1]
Example #19
def mul_kernel(a, b, c):
    i = dppy.get_global_id(0)
    b[i] = a[i] * c
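Here ``c`` is a plain scalar multiplier rather than an array. A minimal call sketch, assuming ``mul_kernel`` is decorated with ``@dppy.kernel`` like the other examples; the device string and sizes are assumptions:

import numpy as np
import dpctl
import numba_dppy as dppy

a = np.arange(10, dtype=np.float32)
b = np.zeros_like(a)

with dpctl.device_context("opencl:gpu"):
    # The scalar multiplier is passed by value alongside the arrays.
    mul_kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, b, 3.0)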
Example #20
def data_parallel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
    i = dppy.get_global_id(0)  # numba-kernel-breakpoint
    l1 = a_in_kernel[i]  # second-line
    l2 = b_in_kernel[i]  # third-line
    c_in_kernel[i] = l1 + l2  # fourth-line
Example #21
def private_memory_kernel(A):
    i = numba_dppy.get_global_id(0)
    prvt_mem = numba_dppy.private.array(shape=1, dtype=np.float32)
    prvt_mem[0] = i
    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
    A[i] = prvt_mem[0] * 2
Example #22
def f(a, b):
    i = dppy.get_global_id(0)
    b[i] = uop(a[i])
Example #23
def dppy_f(array_like_obj):
    i = dppy.get_global_id(0)
    array_like_obj[i] = 10
Example #24
def reduction_kernel(A, R, stride):
    i = dppy.get_global_id(0)
    # sum two elements
    R[i] = A[i] + A[i + stride]
    # store the sum to be used in the next iteration
    A[i] = R[i]
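The comment about the next iteration implies a host loop that relaunches the kernel with a shrinking problem size. A minimal sketch of such a driver, assuming the kernel is decorated with ``@dppy.kernel`` and the input length is a power of two; the device string is an assumption:

import numpy as np
import dpctl
import numba_dppy as dppy

A = np.arange(1024, dtype=np.float32)
R = np.zeros_like(A)

total = A.size
with dpctl.device_context("opencl:gpu"):
    while total > 1:
        # Each launch folds the upper half of the active range onto the lower half.
        half = total // 2
        reduction_kernel[half, dppy.DEFAULT_LOCAL_SIZE](A, R, half)
        total = half

result = R[0]  # the full sum after the final fold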
Example #25
def dppy_kernel(a_in_kernel, b_in_kernel, c_in_kernel):
    i = dppy.get_global_id(0)
    c_in_kernel[i] = dppy_loop_body(i, a_in_kernel, b_in_kernel)
Example #26
def calCentroidsSum1(arrayCsum, arrayCnumpoint):
    i = numba_dppy.get_global_id(0)
    arrayCsum[i, 0] = 0
    arrayCsum[i, 1] = 0
    arrayCnumpoint[i] = 0
Example #27
def data_parallel_sum(a, b, c):
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]
Example #28
def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
    i = numba_dppy.get_global_id(0)
    ci = arrayPcluster[i]
    numba_dppy.atomic.add(arrayCsum, (ci, 0), arrayP[i, 0])
    numba_dppy.atomic.add(arrayCsum, (ci, 1), arrayP[i, 1])
    numba_dppy.atomic.add(arrayCnumpoint, ci, 1)
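Several work-items may map points to the same centroid at once, which is why the per-cluster sums and counts are updated with ``numba_dppy.atomic.add`` instead of a plain ``+=``. A minimal, self-contained sketch of the same idea using a hypothetical histogram kernel; the device string and sizes are assumptions:

import numpy as np
import dpctl
import numba_dppy

@numba_dppy.kernel
def histogram(values, bins):
    i = numba_dppy.get_global_id(0)
    # Many work-items may hit the same bin, so the update must be atomic.
    numba_dppy.atomic.add(bins, values[i], 1)

values = np.random.randint(0, 8, size=1024).astype(np.int32)
bins = np.zeros(8, dtype=np.int32)
with dpctl.device_context("opencl:gpu"):
    histogram[values.size, numba_dppy.DEFAULT_LOCAL_SIZE](values, bins)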
Example #29
def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
    i = dppy.get_global_id(0)
    c_in_kernel[i] = func_sum(a_in_kernel[i], b_in_kernel[i])
Example #30
def updateCentroids(arrayC, arrayCsum, arrayCnumpoint, num_centroids):
    i = numba_dppy.get_global_id(0)
    arrayC[i, 0] = arrayCsum[i, 0] / arrayCnumpoint[i]
    arrayC[i, 1] = arrayCsum[i, 1] / arrayCnumpoint[i]