def test_vector_fill(ctx_factory):
    """Smoke-test filling and zero-initialising a device array of float4.

    A 100-element float4 array is filled with one float4 value and copied
    back to the host; only the dtype of the host copy is asserted. The
    final ``zeros`` call is exercised purely to check that it runs.
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    fill_value = cltypes.make_float4(0.0, 0.0, 1.0, 0.0)
    filled = cl_array.Array(cq, 100, dtype=cltypes.float4)
    filled.fill(fill_value)

    host_copy = filled.get()
    assert host_copy.dtype == cltypes.float4

    # Result intentionally unused: this only checks that zeros() accepts
    # a vector dtype without raising.
    filled = cl_array.zeros(cq, 100, dtype=cltypes.float4)
def test_vector_args(ctx_factory):
    """Check that a float4 can be passed to a kernel as a by-value argument.

    The ``set_vec`` kernel writes its float4 argument into every element
    of the destination buffer; after copying the buffer back, every host
    element must compare equal to that value.
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    kernel_source = (
        " __kernel void set_vec(float4 x, __global float4 *dest)"
        " { dest[get_global_id(0)] = x; } "
    )
    program = cl.Program(ctx, kernel_source).build()

    vec = cltypes.make_float4(1, 2, 3, 4)
    host_out = np.empty(50000, cltypes.float4)

    flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
    device_out = cl.Buffer(ctx, flags, hostbuf=host_out)

    # One work-item per destination element; local size left to the runtime.
    program.set_vec(cq, host_out.shape, None, vec, device_out)

    copy_done = cl.enqueue_copy(cq, host_out, device_out)
    copy_done.wait()

    assert (host_out == vec).all()
def test_axpy(ctx_factory):
    """Run the ``z[i] = a*x[i] + b*y[i]`` kernel through auto_test_vs_ref.

    For each of three element types (complex64, float4, float32) the
    kernel is built once and then checked in two transformed variants,
    with the untransformed kernel passed alongside them. The test is
    skipped when the first device's platform vendor starts with
    "Advanced Micro".
    """
    logging.basicConfig(level="INFO")
    ctx = ctx_factory()

    n = 3145182

    vendor = ctx.devices[0].platform.vendor
    if vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    def variant_cpu(kernel):
        # Large outer blocks mapped to group axis 0, inner part unrolled.
        unroll = 16
        block_size = unroll * 4096
        kernel = lp.split_iname(
            kernel, "i", block_size, outer_tag="g.0", slabs=(0, 1))
        return lp.split_iname(kernel, "i_inner", unroll, inner_tag="unr")

    def variant_gpu(kernel):
        # Each group covers unroll * block_size elements; the inner split
        # is unrolled on the outside and mapped to local axis 0 inside.
        unroll = 4
        block_size = 256
        kernel = lp.split_iname(
            kernel, "i", unroll * block_size, outer_tag="g.0", slabs=(0, 1))
        return lp.split_iname(
            kernel, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")

    variants = [variant_cpu, variant_gpu]

    # (element dtype, result checker or None, value of a, value of b)
    cases = [
        (np.complex64, None, 5, 7),
        (
            cltypes.float4,  # pylint:disable=no-member
            check_float4,
            cltypes.make_float4(1, 2, 3, 4),  # pylint:disable=no-member
            cltypes.make_float4(6, 7, 8, 9),  # pylint:disable=no-member
        ),
        (np.float32, None, 5, 7),
    ]

    for dtype, check, a, b in cases:
        kernel_args = [
            lp.ValueArg("a", dtype),
            lp.GlobalArg("x", dtype, shape="n,"),
            lp.ValueArg("b", dtype),
            lp.GlobalArg("y", dtype, shape="n,"),
            lp.GlobalArg("z", dtype, shape="n,"),
            lp.ValueArg("n", np.int32, approximately=n),
        ]
        seq_knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            ["z[i] = a*x[i]+b*y[i]"],
            kernel_args,
            name="axpy",
            assumptions="n>=1")

        # Three arrays (x, y, z) of n elements each, scaled by 1e9 to match
        # the "GBytes" label.
        gbytes = np.dtype(dtype).itemsize * n * 3 / 1e9

        for make_variant in variants:
            lp.auto_test_vs_ref(
                seq_knl,
                ctx,
                make_variant(seq_knl),
                op_count=[gbytes],
                op_label=["GBytes"],
                parameters={"a": a, "b": b, "n": n},
                check_result=check,
                blacklist_ref_vendors=["Advanced Micro"])
def csr_to_ocl_sell4(n_row, rowptr, colidx, val):
    """
    Convert a CSR matrix to Sliced ELLPACK format with slice height 4.

    Rows are grouped into slices of four consecutive rows. Each slice is
    as wide as its longest row and is emitted column by column; every
    column of a slice becomes one int4 of column indices and one float4
    of values (one lane per row of the slice).

    Padding rules:

    * A real row that is shorter than its slice is padded with value 0
      and with the most recent column index stored in that slice (0 if
      none has been stored yet).
    * When ``n_row`` is not a multiple of 4, the missing rows of the last
      slice are padded with column index 0 and value 0.

    Parameters
    ----------
    n_row : int
        Number of rows
    rowptr : sequence of int
        Row pointer of CSR format (at least ``n_row + 1`` entries)
    colidx : sequence of int
        Column index of CSR format
    val : sequence of float
        Non-zero element values of CSR format

    Returns
    -------
    slice_count : int
        Number of slices, ``ceil(n_row / 4)``
    ell_colidx : ndarray
        Column indices in Sliced ELLPACK format, one int4 per slice column
    ell_sliceptr : ndarray of int32
        Start offset of every slice followed by the total, counted in
        scalar elements including padding (``slice_count + 1`` entries)
    ell_slicecol : ndarray of int32
        Cumulative slice widths, starting with 0 (``slice_count + 1``
        entries)
    ell_val : ndarray
        Element values in Sliced ELLPACK format, one float4 per slice
        column

    Notes
    -----
    NOTE(review): when the matrix has no stored entries, ``ell_colidx``
    and ``ell_val`` are built from empty lists, so NumPy picks its default
    dtype for them instead of a vector dtype.
    """
    slice_height = 4
    slice_count = math.ceil(n_row / slice_height)

    element_count = 0  # scalar elements emitted so far, padding included
    total_width = 0    # sum of the widths of all slices emitted so far
    ell_colidx = []
    ell_val = []
    ell_sliceptr = []
    ell_slicecol = [0]

    for slice_idx in range(slice_count):
        first_row = slice_idx * slice_height
        # Only the final slice can contain fewer than slice_height rows.
        rows_here = min(slice_height, n_row - first_row)

        # (start, end) positions in colidx/val for every real row.
        row_bounds = [
            (rowptr[row], rowptr[row + 1])
            for row in range(first_row, first_row + rows_here)
        ]

        # Slice width = length of its longest row.
        width = 0
        for start, end in row_bounds:
            width = max(width, end - start)

        ell_sliceptr.append(element_count)
        total_width += width
        ell_slicecol.append(total_width)

        # Column index reused for padding inside real rows; it carries
        # over between rows and columns but is reset for every slice.
        last_col = 0
        for j in range(width):
            lane_cols = []
            lane_vals = []
            for k in range(slice_height):
                element_count += 1
                if k >= rows_here:
                    # Row does not exist (tail of the last slice).
                    lane_cols.append(0)
                    lane_vals.append(0)
                    continue

                start, end = row_bounds[k]
                pos = start + j
                if pos < end:
                    last_col = colidx[pos]
                    lane_cols.append(last_col)
                    lane_vals.append(val[pos])
                else:
                    # Row exhausted: pad with a zero value.
                    lane_cols.append(last_col)
                    lane_vals.append(0)

            ell_colidx.append(cltypes.make_int4(*lane_cols))
            ell_val.append(cltypes.make_float4(*lane_vals))

    # Closing entry so that sliceptr[s + 1] - sliceptr[s] is slice s's size.
    ell_sliceptr.append(element_count)

    return (
        slice_count,
        np.array(ell_colidx),
        np.array(ell_sliceptr, dtype=np.int32),
        np.array(ell_slicecol, dtype=np.int32),
        np.array(ell_val),
    )