def data_parallel_sum(a, b, c):
    """Element-wise addition of two 2-D arrays: ``c = a + b``.

    Each work item handles the single (row, col) element selected by
    its two-dimensional global id.
    """
    row = dppy.get_global_id(0)
    col = dppy.get_global_id(1)
    c[row, col] = a[row, col] + b[row, col]
def data_parallel_sum(a, b, c):
    """Two-dimensional vector addition written as a ``kernel`` function.

    The work item at global id (x, y) stores ``a[x, y] + b[x, y]``
    into ``c[x, y]``.
    """
    x = dppy.get_global_id(0)
    y = dppy.get_global_id(1)
    c[x, y] = a[x, y] + b[x, y]
def dppy_gemm(a, b, c):
    """Naive matrix-multiply kernel: ``c = a @ b``.

    One work item computes one element of ``c``.  Work items whose
    global id falls outside ``c`` (the launch grid may over-cover the
    output) return early.

    Bug fix: the reduction loop previously iterated over
    ``c.shape[0]`` (the number of output rows); the correct extent is
    the shared inner dimension ``a.shape[1]`` (== ``b.shape[0]``),
    which only coincides with ``c.shape[0]`` for square matrices.
    """
    i = numba_dppy.get_global_id(0)
    j = numba_dppy.get_global_id(1)

    if i >= c.shape[0] or j >= c.shape[1]:
        return

    c[i, j] = 0
    # Reduce over columns of ``a`` / rows of ``b``.
    for k in range(a.shape[1]):
        c[i, j] += a[i, k] * b[k, j]
def dppy_gemm(a, b, c):
    """
    A basic DGEMM implemented as a ``kernel`` function.

    One work item computes one output element ``c[i, j]``; extra work
    items (when the launch grid over-covers ``c``) exit early.

    Bug fix: the reduction loop previously iterated ``c.shape[0]``
    (output rows); the correct extent is the shared inner dimension
    ``a.shape[1]``, which differs from ``c.shape[0]`` for non-square
    operands.
    """
    i = dppy.get_global_id(0)
    j = dppy.get_global_id(1)

    if i >= c.shape[0] or j >= c.shape[1]:
        return

    c[i, j] = 0
    # Reduce over columns of ``a`` / rows of ``b``.
    for k in range(a.shape[1]):
        c[i, j] += a[i, k] * b[k, j]
def sum_reduction_kernel(A, input_size, partial_sums):
    """Tree-style partial sum reduction over ``A``.

    Each work group reduces its slice of ``A`` in local memory and
    writes one partial sum per group into ``partial_sums``; the caller
    is expected to finish the reduction over ``partial_sums``.

    NOTE(review): the local buffer is fixed at 64 elements and the
    halving loop assumes a power-of-two work-group size no larger
    than 64 — confirm against the launch configuration.
    """
    local_id = dppy.get_local_id(0)
    global_id = dppy.get_global_id(0)
    group_size = dppy.get_local_size(0)
    group_id = dppy.get_group_id(0)

    local_sums = dppy.local.array(64, int32)

    # Zero first so out-of-range work items contribute nothing to the sum.
    local_sums[local_id] = 0
    if global_id < input_size:
        local_sums[local_id] = A[global_id]

    # Loop for computing local_sums : divide workgroup into 2 parts
    stride = group_size // 2
    while stride > 0:
        # Waiting for each 2x2 addition into given workgroup
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
        # Add elements 2 by 2 between local_id and local_id + stride
        if local_id < stride:
            local_sums[local_id] += local_sums[local_id + stride]
        stride >>= 1

    # Work item 0 publishes this group's total.
    if local_id == 0:
        partial_sums[group_id] = local_sums[0]
def sum_reduction_kernel(A, partial_sums):
    """
    The example demonstrates a reduction kernel implemented as a
    ``kernel`` function.

    Each work group copies its slice of ``A`` into local memory,
    halves the active range until one value remains, and stores the
    group-wide sum into ``partial_sums[group_id]``.

    NOTE(review): ``A[global_id]`` is indexed without a bounds guard,
    so the global size must not exceed ``len(A)``; the halving loop
    also assumes a power-of-two group size no larger than the
    64-element local buffer — confirm at launch.
    """
    local_id = dppy.get_local_id(0)
    global_id = dppy.get_global_id(0)
    group_size = dppy.get_local_size(0)
    group_id = dppy.get_group_id(0)

    local_sums = dppy.local.array(64, int32)

    # Copy from global to local memory
    local_sums[local_id] = A[global_id]

    # Loop for computing local_sums : divide workgroup into 2 parts
    stride = group_size // 2
    while stride > 0:
        # Waiting for each 2x2 addition into given workgroup
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
        # Add elements 2 by 2 between local_id and local_id + stride
        if local_id < stride:
            local_sums[local_id] += local_sums[local_id + stride]
        stride >>= 1

    # Work item 0 publishes this group's total.
    if local_id == 0:
        partial_sums[group_id] = local_sums[0]
def black_scholes(nopt, price, strike, t, rate, vol, call, put):
    """Black-Scholes call/put pricing, one option per work item.

    ``nopt`` is part of the caller-facing signature but unused here;
    the global id selects the option.  Relies on ``log``/``sqrt``/
    ``erf``/``exp`` being in scope at module level.
    """
    neg_rate = -rate
    two_sig_sq = vol * vol * 2

    opt = numba_dppy.get_global_id(0)

    p = price[opt]
    s = strike[opt]
    tt = t[opt]

    log_ps = log(p / s)
    drift = tt * neg_rate

    var_t = tt * two_sig_sq
    quarter_var = 0.25 * var_t
    inv_sigma = 1.0 / sqrt(var_t)

    w1 = (log_ps - drift + quarter_var) * inv_sigma
    w2 = (log_ps - drift - quarter_var) * inv_sigma

    d1 = 0.5 + 0.5 * erf(w1)
    d2 = 0.5 + 0.5 * erf(w2)

    discounted_strike = exp(drift) * s
    call_price = p * d1 - discounted_strike * d2

    call[opt] = call_price
    # Put price via put-call parity on the same intermediates.
    put[opt] = call_price - p + discounted_strike
def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
    """
    A simple implementation of the Black-Scholes formula using the
    explicit OpenCL-style kernel programming model.

    One work item prices one option: ``S``/``X``/``T`` are per-option
    arrays (spot, strike, expiry), ``R`` and ``V`` are scalar rate and
    volatility.  The cumulative normal distribution is approximated
    with the Abramowitz-Stegun polynomial using the module-level
    constants ``A1``..``A5`` and ``RSQRT2PI``.
    """
    i = dppy.get_global_id(0)
    # Guard against a launch grid larger than the option count.
    if i >= S.shape[0]:
        return

    sqrtT = math.sqrt(T[i])
    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
    d2 = d1 - V * sqrtT

    # CND(d1) via the polynomial approximation; symmetric for d1 > 0.
    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
    cndd1 = (
        RSQRT2PI
        * math.exp(-0.5 * d1 * d1)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d1 > 0:
        cndd1 = 1.0 - cndd1

    # CND(d2), same approximation.
    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
    cndd2 = (
        RSQRT2PI
        * math.exp(-0.5 * d2 * d2)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d2 > 0:
        cndd2 = 1.0 - cndd2

    expRT = math.exp((-1.0 * R) * T[i])
    callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
    putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
    """Black-Scholes pricing kernel; one work item prices one option.

    ``S``/``X``/``T`` are per-option arrays (spot, strike, expiry);
    ``R`` and ``V`` are scalar rate and volatility.  Uses the
    Abramowitz-Stegun polynomial approximation of the cumulative
    normal distribution via the module-level constants ``A1``..``A5``
    and ``RSQRT2PI``.
    """
    i = dppy.get_global_id(0)
    # Guard against a launch grid larger than the option count.
    if i >= S.shape[0]:
        return

    sqrtT = math.sqrt(T[i])
    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (
        V * sqrtT
    )
    d2 = d1 - V * sqrtT

    # CND(d1) via the polynomial approximation; symmetric for d1 > 0.
    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
    cndd1 = (
        RSQRT2PI
        * math.exp(-0.5 * d1 * d1)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d1 > 0:
        cndd1 = 1.0 - cndd1

    # CND(d2), same approximation.
    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
    cndd2 = (
        RSQRT2PI
        * math.exp(-0.5 * d2 * d2)
        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
    )
    if d2 > 0:
        cndd2 = 1.0 - cndd2

    expRT = math.exp((-1.0 * R) * T[i])
    callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
    putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
def reverse_array(A):
    """Add the reverse of ``A`` to ``A`` in place: ``A[i] += A[n-1-i]``.

    Relies on a module-level ``blocksize`` constant for the array
    length.  The single local barrier is sufficient only because the
    kernel is launched as one work group (see comment below).
    """
    lm = dppy.local.array(shape=10, dtype=np.float32)
    i = dppy.get_global_id(0)

    # preload
    lm[i] = A[i]
    # barrier local or global will both work as we only have one work group
    dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
    # write
    A[i] += lm[blocksize - 1 - i]
def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
    """Assign each 2-D point to its nearest centroid (Euclidean distance).

    One work item handles one point; the winning centroid index is
    written into ``arrayPcluster``.  ``num_points`` is part of the
    caller-facing signature but unused inside the kernel.
    """
    point = numba_dppy.get_global_id(0)

    # -1 acts as the "no distance seen yet" sentinel.
    best = -1
    for centroid in range(num_centroids):
        dx = arrayP[point, 0] - arrayC[centroid, 0]
        dy = arrayP[point, 1] - arrayC[centroid, 1]
        dist = numpy.sqrt(dx * dx + dy * dy)

        if best > dist or best == -1:
            best = dist
            arrayPcluster[point] = centroid
def private_memory_kernel(A):
    """Write ``2 * global_id`` into ``A`` through a one-element private
    buffer.

    Demonstrates ``private.array``: each work item gets its own copy
    of ``memory``, so the result is deterministic per element.
    """
    memory = numba_dppy.private.array(shape=1, dtype=np.float32)
    i = numba_dppy.get_global_id(0)

    # preload
    memory[0] = i
    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence

    # memory will not hold correct deterministic result if it is not
    # private to each thread.
    A[i] = memory[0] * 2
def pairwise_python(X1, X2, D):
    """Fill row ``i`` of ``D`` with the Euclidean distances from
    ``X1[i]`` to every row of ``X2``.

    One work item computes one full row of the distance matrix.
    """
    row = numba_dppy.get_global_id(0)
    n_other = X2.shape[0]
    n_features = X1.shape[1]

    for col in range(n_other):
        acc = 0.0
        for f in range(n_features):
            diff = X1[row, f] - X2[col, f]
            acc += diff * diff
        D[row, col] = np.sqrt(acc)
def pairwise_distance(X, D, xshape0, xshape1):
    """
    An Euclidean pairwise distance computation implemented as a
    ``kernel`` function.

    Each work item fills one row of ``D`` with the distances from
    ``X[idx]`` to every row of ``X``.  ``xshape0``/``xshape1`` are
    accepted in the signature but the loop bounds are taken from
    ``X.shape`` directly, so they are effectively unused here.
    """
    idx = dppy.get_global_id(0)

    # for i in range(xshape0):
    for j in range(X.shape[0]):
        d = 0.0
        for k in range(X.shape[1]):
            tmp = X[idx, k] - X[j, k]
            d += tmp * tmp
        D[idx, j] = sqrt(d)
def sum_kernel(a, b, c):
    """One-dimensional vector addition: each work item adds one pair
    of elements, ``c[i] = a[i] + b[i]``."""
    gid = dppy.get_global_id(0)
    c[gid] = a[gid] + b[gid]
def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) # numba-kernel-breakpoint l1 = a[i] # second-line l2 = b[i] # third-line c[i] = l1 + l2 # fourth-line
def h(a, b):
    """Apply the device function ``g`` element-wise and add one:
    ``b[i] = g(a[i]) + 1``."""
    gid = dppy.get_global_id(0)
    b[gid] = g(a[gid]) + 1
def copy_arrayC(arrayC, arrayP):
    """Copy the (x, y) pair of one row from ``arrayP`` into ``arrayC``;
    the global id selects the row."""
    row = numba_dppy.get_global_id(0)
    arrayC[row, 0] = arrayP[row, 0]
    arrayC[row, 1] = arrayP[row, 1]
def mul_kernel(a, b, c):
    """Scale each element of ``a`` by the scalar ``c`` into ``b``:
    ``b[i] = a[i] * c``."""
    gid = dppy.get_global_id(0)
    b[gid] = a[gid] * c
def data_parallel_sum(a_in_kernel, b_in_kernel, c_in_kernel): i = dppy.get_global_id(0) # numba-kernel-breakpoint l1 = a_in_kernel[i] # second-line l2 = b_in_kernel[i] # third-line c_in_kernel[i] = l1 + l2 # fourth-line
def private_memory_kernel(A):
    """Write ``2 * global_id`` into ``A`` through a one-element private
    array (demonstrates ``private.array`` per-work-item storage)."""
    i = numba_dppy.get_global_id(0)
    prvt_mem = numba_dppy.private.array(shape=1, dtype=np.float32)
    prvt_mem[0] = i
    numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
    A[i] = prvt_mem[0] * 2
def f(a, b):
    """Apply the device function ``uop`` element-wise:
    ``b[i] = uop(a[i])``."""
    gid = dppy.get_global_id(0)
    b[gid] = uop(a[gid])
def dppy_f(array_like_obj):
    """Set every element covered by the global range to the constant 10."""
    gid = dppy.get_global_id(0)
    array_like_obj[gid] = 10
def reduction_kernel(A, R, stride): i = dppy.get_global_id(0) # sum two element R[i] = A[i] + A[i + stride] # store the sum to be used in nex iteration A[i] = R[i]
def dppy_kernel(a_in_kernel, b_in_kernel, c_in_kernel):
    """Evaluate the device function ``dppy_loop_body`` once per work
    item and store its result into the output array."""
    gid = dppy.get_global_id(0)
    c_in_kernel[gid] = dppy_loop_body(gid, a_in_kernel, b_in_kernel)
def calCentroidsSum1(arrayCsum, arrayCnumpoint):
    """Zero the per-centroid coordinate sums and point counts before a
    new accumulation pass; one work item resets one centroid."""
    cid = numba_dppy.get_global_id(0)
    arrayCsum[cid, 0] = 0
    arrayCsum[cid, 1] = 0
    arrayCnumpoint[cid] = 0
def data_parallel_sum(a, b, c):
    """One-dimensional vector addition: each work item adds one pair
    of elements, ``c[i] = a[i] + b[i]``."""
    gid = dppy.get_global_id(0)
    c[gid] = a[gid] + b[gid]
def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
    """Accumulate point coordinates and point counts per assigned
    centroid.

    Atomic adds are required because many work items may target the
    same centroid concurrently.
    """
    i = numba_dppy.get_global_id(0)
    ci = arrayPcluster[i]
    numba_dppy.atomic.add(arrayCsum, (ci, 0), arrayP[i, 0])
    numba_dppy.atomic.add(arrayCsum, (ci, 1), arrayP[i, 1])
    numba_dppy.atomic.add(arrayCnumpoint, ci, 1)
def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
    """Element-wise sum via the device function ``func_sum``; one work
    item produces one output element."""
    gid = dppy.get_global_id(0)
    c_in_kernel[gid] = func_sum(a_in_kernel[gid], b_in_kernel[gid])
def updateCentroids(arrayC, arrayCsum, arrayCnumpoint, num_centroids):
    """Recompute each centroid as the mean of its assigned points.

    ``num_centroids`` is accepted in the signature but unused; the
    global id selects the centroid.

    NOTE(review): divides by ``arrayCnumpoint[i]`` with no zero
    guard — an empty cluster would divide by zero; confirm callers
    guarantee every centroid has at least one assigned point.
    """
    i = numba_dppy.get_global_id(0)
    arrayC[i, 0] = arrayCsum[i, 0] / arrayCnumpoint[i]
    arrayC[i, 1] = arrayCsum[i, 1] / arrayCnumpoint[i]