def test_dot(ctx_factory):
    """Compare cl_array.dot and cl_array.vdot against np.dot and np.vdot.

    Every ordered pairing of the tested dtypes is exercised. Double
    precision dtypes are included only when has_double_support() reports
    them for the first device of the context. The test is skipped when
    mako cannot be imported.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    size = 200000
    # Flattened form of the nested "for a in dtypes: for b in dtypes" loop;
    # the iteration order is unchanged.
    dtype_pairs = [(lhs, rhs) for lhs in dtypes for rhs in dtypes]

    for a_dtype, b_dtype in dtype_pairs:
        print(a_dtype, b_dtype)

        a_dev = general_clrand(queue, (size,), a_dtype)
        a_host = a_dev.get()
        b_dev = general_clrand(queue, (size,), b_dtype)
        b_host = b_dev.get()

        # Plain dot product: relative error against the host result.
        expected_dot = np.dot(a_host, b_host)
        actual_dot = cl_array.dot(a_dev, b_dev).get()
        dot_rel_err = abs(actual_dot - expected_dot) / abs(expected_dot)
        assert dot_rel_err < 1e-4

        # vdot: same relative-error criterion.
        expected_vdot = np.vdot(a_host, b_host)
        actual_vdot = cl_array.vdot(a_dev, b_dev).get()
        vdot_rel_err = abs(actual_vdot - expected_vdot) / abs(expected_vdot)
        assert vdot_rel_err < 1e-4
def test_dot(ctx_factory):
    """Check device dot/vdot results against NumPy for all dtype pairings.

    The dtype list always holds float32 and complex64; float64 and
    complex128 are appended when has_double_support() is true for the
    first device. Skipped if mako is not importable.
    """
    from pytest import importorskip
    importorskip("mako")

    def relative_error(actual, reference):
        # Deviation of the device value, scaled by the host reference.
        return abs(actual - reference) / abs(reference)

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    single_precision = [np.float32, np.complex64]
    double_precision = [np.float64, np.complex128]

    dtypes = list(single_precision)
    if has_double_support(context.devices[0]):
        dtypes += double_precision

    shape = (200000,)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            lhs_gpu = general_clrand(queue, shape, a_dtype)
            lhs = lhs_gpu.get()
            rhs_gpu = general_clrand(queue, shape, b_dtype)
            rhs = rhs_gpu.get()

            host_dot = np.dot(lhs, rhs)
            gpu_dot = cl_array.dot(lhs_gpu, rhs_gpu).get()
            assert relative_error(gpu_dot, host_dot) < 1e-4

            host_vdot = np.vdot(lhs, rhs)
            gpu_vdot = cl_array.vdot(lhs_gpu, rhs_gpu).get()
            assert relative_error(gpu_vdot, host_vdot) < 1e-4
def test_sum(ctx_factory):
    """Compare cl_array.sum against np.sum over several slices.

    For each dtype an array of 200000 entries is produced by
    general_clrand. Each slice is summed on the host and on the device.
    Slices without a step are checked twice on the device: once by
    indexing the device array and once through the ``slice=`` keyword.
    Slices with a step are checked through the keyword only. Skipped if
    mako is not importable.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    slices_under_test = (
        slice(None),
        slice(1000, 3000),
        slice(1000, -3000),
        slice(1000, None),
        slice(1000, None, 3),
        slice(1000, 1000),  # empty range
    )

    for dtype in (np.float32, np.complex64):
        dev_data = general_clrand(queue, (n,), dtype)
        host_data = dev_data.get()

        for slc in slices_under_test:
            expected = np.sum(host_data[slc])

            # A zero reference sum cannot be used as a divisor, so the
            # comparison falls back to an absolute error in that case.
            ref_divisor = abs(expected) if expected else 1

            if slc.step is None:
                from_view = cl_array.sum(dev_data[slc]).get()
                assert abs(from_view - expected) / ref_divisor < 1e-4

            from_kwarg = cl_array.sum(dev_data, slice=slc).get()
            assert abs(from_kwarg - expected) / ref_divisor < 1e-4
def test_sum(ctx_factory):
    """Check cl_array.sum against np.sum for a set of slices.

    Each slice of a 200000-entry array is summed on the host and on the
    device, and the relative error must stay below 1e-4. Step-less slices
    are summed both via device-array indexing and via the ``slice=``
    keyword; the stepped slice is summed via the keyword only. Skipped if
    mako is not importable.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    length = 200000
    tested_slices = (
        slice(None),
        slice(1000, 3000),
        slice(1000, -3000),
        slice(1000, None),
        slice(1000, None, 3),
    )

    for dtype in (np.float32, np.complex64):
        dev_ary = general_clrand(queue, (length,), dtype)
        host_ary = dev_ary.get()

        for slc in tested_slices:
            expected = np.sum(host_ary[slc])
            # Magnitude of the host sum, used to scale both comparisons.
            scale = abs(expected)

            if slc.step is None:
                via_view = cl_array.sum(dev_ary[slc]).get()
                assert abs(via_view - expected) / scale < 1e-4

            via_kwarg = cl_array.sum(dev_ary, slice=slc).get()
            assert abs(via_kwarg - expected) / scale < 1e-4
def test_dot(ctx_factory):
    """Compare cl_array.dot and cl_array.vdot against np.dot and np.vdot.

    All ordered pairings of the tested dtypes are exercised. float64 is
    added when has_double_support() is true for the first device;
    complex128 is added as well unless has_struct_arg_count_bug() returns
    "apple" for that device. If np.vdot raises NotImplementedError on
    PyPy, the vdot check for that pairing is skipped with a printed
    notice; on other interpreters the error propagates. The whole test is
    skipped when mako cannot be imported.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        dtypes.append(np.float64)
        if has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    size = 200000
    # Flattened form of the nested dtype loops; iteration order unchanged.
    dtype_pairs = [(lhs, rhs) for lhs in dtypes for rhs in dtypes]

    for a_dtype, b_dtype in dtype_pairs:
        print(a_dtype, b_dtype)

        a_dev = general_clrand(queue, (size,), a_dtype)
        a_host = a_dev.get()
        b_dev = general_clrand(queue, (size,), b_dtype)
        b_host = b_dev.get()

        expected_dot = np.dot(a_host, b_host)
        actual_dot = cl_array.dot(a_dev, b_dev).get()
        assert abs(actual_dot - expected_dot) / abs(expected_dot) < 1e-4

        try:
            expected_vdot = np.vdot(a_host, b_host)
        except NotImplementedError:
            import sys
            # Only tolerate the missing host vdot when running on PyPy.
            if "__pypy__" not in sys.builtin_module_names:
                raise
            print("PYPY: VDOT UNIMPLEMENTED")
            continue

        actual_vdot = cl_array.vdot(a_dev, b_dev).get()
        rel_err = abs(actual_vdot - expected_vdot) / abs(expected_vdot)
        assert rel_err < 1e-4, rel_err
def test_dot(ctx_factory):
    """Check device dot/vdot results against NumPy for all dtype pairings.

    float32 and complex64 are always tested. With double support on the
    first device, float64 is added, plus complex128 unless
    has_struct_arg_count_bug() returns "apple". A NotImplementedError from
    np.vdot is tolerated only on PyPy, where the vdot comparison for that
    pairing is skipped. Skipped entirely if mako is not importable.
    """
    from pytest import importorskip
    importorskip("mako")

    def relative_error(actual, reference):
        # Deviation of the device value, scaled by the host reference.
        return abs(actual - reference) / abs(reference)

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        extra = [np.float64]
        if has_struct_arg_count_bug(dev) != "apple":
            extra.append(np.complex128)
        dtypes.extend(extra)

    shape = (200000,)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            lhs_gpu = general_clrand(queue, shape, a_dtype)
            lhs = lhs_gpu.get()
            rhs_gpu = general_clrand(queue, shape, b_dtype)
            rhs = rhs_gpu.get()

            host_dot = np.dot(lhs, rhs)
            gpu_dot = cl_array.dot(lhs_gpu, rhs_gpu).get()
            assert relative_error(gpu_dot, host_dot) < 1e-4

            try:
                host_vdot = np.vdot(lhs, rhs)
            except NotImplementedError:
                import sys
                running_on_pypy = '__pypy__' in sys.builtin_module_names
                if not running_on_pypy:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            gpu_vdot = cl_array.vdot(lhs_gpu, rhs_gpu).get()
            rel_err = relative_error(gpu_vdot, host_vdot)
            assert rel_err < 1e-4, rel_err
def test_sum(ctx_factory):
    """Compare cl_array.sum of a whole array against np.sum.

    Runs for float32 and complex64 arrays of 200000 entries produced by
    general_clrand; the relative error must stay below 1e-4. Skipped if
    mako is not importable.
    """
    from pytest import importorskip
    importorskip("mako")

    queue = cl.CommandQueue(ctx_factory())
    size = 200000

    for dtype in (np.float32, np.complex64):
        dev_data = general_clrand(queue, (size,), dtype)

        # Host reference is computed from a copy fetched off the device.
        expected = np.sum(dev_data.get())
        actual = cl_array.sum(dev_data).get()

        assert abs(actual - expected) / abs(expected) < 1e-4
def test_sum(ctx_factory):
    """Check the device-side sum of a full array against the NumPy sum.

    For each of float32 and complex64, an array of 200000 entries from
    general_clrand is summed on the host and on the device, and the
    relative error is required to be below 1e-4. Skipped if mako is not
    importable.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    shape = (200000,)
    tested_dtypes = (np.float32, np.complex64)

    for dtype in tested_dtypes:
        gpu_ary = general_clrand(queue, shape, dtype)
        host_ary = gpu_ary.get()

        host_total = np.sum(host_ary)
        gpu_total = cl_array.sum(gpu_ary).get()

        rel_err = abs(gpu_total - host_total) / abs(host_total)
        assert rel_err < 1e-4