def check_performance(ctx_and_double, shape1, shape2, bwo): ctx, double = ctx_and_double dtype = numpy.float64 if double else numpy.float32 a = get_test_array(shape1, dtype) b = get_test_array(shape2, dtype) a_dev = ctx.to_device(a) b_dev = ctx.to_device(b) res_ref = ref_dot(a, b) res_dev = ctx.allocate(res_ref.shape, dtype=dtype) try: dot = MatrixMul(ctx).prepare_for(res_dev, a_dev, b_dev, block_width_override=bwo) except ValueError: pytest.skip() attempts = 10 t1 = time.time() for i in range(attempts): dot(res_dev, a_dev, b_dev) ctx.synchronize() t2 = time.time() assert diff_is_negligible(ctx.from_device(res_dev), res_ref) return (t2 - t1) / attempts, product(res_ref.shape) * shape1[-1] * 2
def test_sizes(ctx_with_gs_limits, gl_size, gs_is_multiple): """ Test that virtual sizes are correct. """ ctx = ctx_with_gs_limits grid_size, local_size = gl_size ref = ReferenceIds(grid_size, local_size, gs_is_multiple) get_sizes = ctx.compile_static(""" KERNEL void get_sizes(GLOBAL_MEM int *sizes) { if (virtual_global_flat_id() > 0) return; for (int i = 0; i < 3; i++) { sizes[i] = virtual_local_size(i); sizes[i + 3] = virtual_num_groups(i); sizes[i + 6] = virtual_global_size(i); } sizes[9] = virtual_global_flat_size(); } """, 'get_sizes', ref.global_size, local_size=ref.local_size) sizes = ctx.allocate(10, numpy.int32) get_sizes(sizes) gls = list(ref.global_size) + [1] * (3 - len(ref.global_size)) ls = list(ref.local_size) + [1] * (3 - len(ref.local_size)) gs = [min_blocks(g, l) for g, l in zip(gls, ls)] ref_sizes = numpy.array(ls + gs + gls + [product(gls)]).astype(numpy.int32) assert diff_is_negligible(sizes.get(), ref_sizes)
def check_performance(ctx_and_double, shape_and_axes): ctx, double = ctx_and_double shape, axes = shape_and_axes dtype = numpy.complex128 if double else numpy.complex64 data = get_test_array(shape, dtype) data_dev = ctx.to_device(data) res_dev = ctx.empty_like(data_dev) fft = FFT(ctx).prepare_for(res_dev, data_dev, None, axes=axes) attempts = 10 t1 = time.time() for i in range(attempts): fft(res_dev, data_dev, -1) ctx.synchronize() t2 = time.time() dev_time = (t2 - t1) / attempts fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) assert diff_is_negligible(res_dev.get(), fwd_ref) return dev_time, product(shape) * sum([numpy.log2(shape[a]) for a in axes]) * 5
def test_ids(ctx_with_gs_limits, gl_size, gs_is_multiple): """ Test that virtual IDs are correct for each thread. """ ctx = ctx_with_gs_limits grid_size, local_size = gl_size ref = ReferenceIds(grid_size, local_size, gs_is_multiple) get_ids = ctx.compile_static(""" KERNEL void get_ids(GLOBAL_MEM int *fid, GLOBAL_MEM int *lx, GLOBAL_MEM int *ly, GLOBAL_MEM int *lz, GLOBAL_MEM int *gx, GLOBAL_MEM int *gy, GLOBAL_MEM int *gz, GLOBAL_MEM int *glx, GLOBAL_MEM int *gly, GLOBAL_MEM int *glz) { VIRTUAL_SKIP_THREADS; const int i = virtual_global_flat_id(); fid[i] = i; lx[i] = virtual_local_id(0); ly[i] = virtual_local_id(1); lz[i] = virtual_local_id(2); gx[i] = virtual_group_id(0); gy[i] = virtual_group_id(1); gz[i] = virtual_group_id(2); glx[i] = virtual_global_id(0); gly[i] = virtual_global_id(1); glz[i] = virtual_global_id(2); } """, 'get_ids', ref.global_size, local_size=ref.local_size) fid = ctx.allocate(product(ref.np_global_size), numpy.int32) lx = ctx.allocate(ref.np_global_size, numpy.int32) ly = ctx.allocate(ref.np_global_size, numpy.int32) lz = ctx.allocate(ref.np_global_size, numpy.int32) gx = ctx.allocate(ref.np_global_size, numpy.int32) gy = ctx.allocate(ref.np_global_size, numpy.int32) gz = ctx.allocate(ref.np_global_size, numpy.int32) glx = ctx.allocate(ref.np_global_size, numpy.int32) gly = ctx.allocate(ref.np_global_size, numpy.int32) glz = ctx.allocate(ref.np_global_size, numpy.int32) get_ids(fid, lx, ly, lz, gx, gy, gz, glx, gly, glz) assert diff_is_negligible(fid.get(), ref.predict_global_flat_ids()) assert diff_is_negligible(lx.get(), ref.predict_local_ids(0)) assert diff_is_negligible(ly.get(), ref.predict_local_ids(1)) assert diff_is_negligible(lz.get(), ref.predict_local_ids(2)) assert diff_is_negligible(gx.get(), ref.predict_group_ids(0)) assert diff_is_negligible(gy.get(), ref.predict_group_ids(1)) assert diff_is_negligible(gz.get(), ref.predict_group_ids(2)) assert diff_is_negligible(glx.get(), ref.predict_global_ids(0)) assert diff_is_negligible(gly.get(), ref.predict_global_ids(1)) assert diff_is_negligible(glz.get(), ref.predict_global_ids(2))
def ref_dot(a, b): a_batch = product(a.shape[:-2]) b_batch = product(b.shape[:-2]) assert a_batch == b_batch or a_batch == 1 or b_batch == 1 a = a.reshape(a_batch, a.shape[-2], a.shape[-1]) b = b.reshape(b_batch, b.shape[-2], b.shape[-1]) out_batch = max(a_batch, b_batch) out_shape = (out_batch, a.shape[-2], b.shape[-1]) out_dtype = numpy.result_type(a.dtype, b.dtype) out = numpy.empty(out_shape, out_dtype) for i in range(out_batch): ai = 0 if a_batch == 1 else i bi = 0 if b_batch == 1 else i out[i] = numpy.dot(a[ai], b[bi]) if a_batch == b_batch == 1: out = out.reshape(out.shape[-2], out_shape[-1]) return out
def test_find_local_size(ctx_and_global_size): ctx, global_size = ctx_and_global_size """ Check that if None is passed as local_size, kernel can find some local_size to run with (not necessarily optimal). """ module = ctx.compile(""" KERNEL void test(GLOBAL_MEM int *dest) { const int i = get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(1) * get_global_size(0); dest[i] = i; } """) test = module.test dest_dev = ctx.allocate(global_size, numpy.int32) test(dest_dev, global_size=global_size) assert diff_is_negligible( dest_dev.get().ravel(), numpy.arange(product(global_size)).astype(numpy.int32))
def test_find_local_size(ctx_and_global_size): ctx, global_size = ctx_and_global_size """ Check that if None is passed as local_size, kernel can find some local_size to run with (not necessarily optimal). """ module = ctx.compile( """ KERNEL void test(GLOBAL_MEM int *dest) { const int i = get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(1) * get_global_size(0); dest[i] = i; } """) test = module.test dest_dev = ctx.allocate(global_size, numpy.int32) test(dest_dev, global_size=global_size) assert diff_is_negligible(dest_dev.get().ravel(), numpy.arange(product(global_size)).astype(numpy.int32))
def predict_global_flat_ids(self): return numpy.arange(product(self.np_global_size)).astype(numpy.int32)
def pytest_generate_tests(metafunc): perf_log_shapes = [ (4, ), (10, ), (13, ), # 1D (4, 4), (7, 7), (10, 10), # 2D (4, 4, 4), (5, 5, 7), (7, 7, 7) ] # 3D perf_mem_limit = 4 * 2**20 if 'shape_and_axes' in metafunc.funcargnames: shapes = [] for x in [3, 8, 9, 10, 11, 12, 13, 20]: shapes.append((2**x, )) for x, y in itertools.product([4, 7, 8, 10], [4, 7, 8, 10]): shapes.append((2**x, 2**y)) for x, y, z in itertools.product([4, 7, 10], [4, 7, 10], [4, 7, 10]): shapes.append((2**x, 2**y, 2**z)) batch_sizes = [1, 16, 128, 1024, 4096] mem_limit = 2**20 vals = [] ids = [] for shape, batch in itertools.product(shapes, batch_sizes): if product(shape) * batch <= mem_limit: if batch == 1: vals.append((shape, None)) else: vals.append( ((batch, ) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('shape_and_axes', vals, ids=ids) elif 'non2batch_shape_and_axes' in metafunc.funcargnames: def idgen(shape_and_axes): shape, axes = shape_and_axes assert len(axes) == 1 outer_batch = shape[:axes[0]] inner_batch = shape[axes[0] + 1:] return ((str(outer_batch) + "x") if len(outer_batch) > 0 else "") + \ str(shape[axes[0]]) + "x" + str(inner_batch) vals = [((17, 16), (1, )), ((177, 256), (1, )), ((39, 16, 7), (1, )), ((17, 16, 131), (1, )), ((7, 1024, 11), (1, )), ((5, 1024, 57), (1, ))] metafunc.parametrize('non2batch_shape_and_axes', vals, ids=list(map(idgen, vals))) elif 'non2problem_shape_and_axes' in metafunc.funcargnames: def idgen(non2problem_shape_and_axes): shape, axes = non2problem_shape_and_axes return str(shape) + 'over' + str(axes) vals = [((17, 15), (1, )), ((17, 17), (1, )), ((19, 4095), (1, )), ((19, 4097), (1, )), ((39, 31, 7), (1, )), ((39, 33, 7), (1, )), ((3, 255, 7), (1, )), ((3, 257, 7), (1, )), ((17, 200, 131), (0, 1)), ((7, 1000, 11), (1, 2)), ((15, 900, 57), (0, 1, 2))] metafunc.parametrize('non2problem_shape_and_axes', vals, ids=list(map(idgen, vals))) elif 'perf_shape_and_axes' in metafunc.funcargnames: vals = [] ids = [] for log_shape in perf_log_shapes: shape = tuple(2**x for x in log_shape) batch = perf_mem_limit // (2**sum(log_shape)) vals.append(((batch, ) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('perf_shape_and_axes', vals, ids=ids) elif 'non2problem_perf_shape_and_axes' in metafunc.funcargnames: vals = [] ids = [] for log_shape in perf_log_shapes: for modifier in (1, -1): shape = tuple(2**(x - 1) + modifier for x in log_shape) batch = perf_mem_limit // (2**sum(log_shape)) vals.append(((batch, ) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('non2problem_perf_shape_and_axes', vals, ids=ids)
def pytest_generate_tests(metafunc): perf_log_shapes = [ (4,), (10,), (13,), # 1D (4, 4), (7, 7), (10, 10), # 2D (4, 4, 4), (5, 5, 7), (7, 7, 7)] # 3D perf_mem_limit = 4 * 2**20 if 'shape_and_axes' in metafunc.funcargnames: shapes = [] for x in [3, 8, 9, 10, 11, 12, 13, 20]: shapes.append((2 ** x,)) for x, y in itertools.product([4, 7, 8, 10], [4, 7, 8, 10]): shapes.append((2 ** x, 2 ** y)) for x, y, z in itertools.product([4, 7, 10], [4, 7, 10], [4, 7, 10]): shapes.append((2 ** x, 2 ** y, 2 ** z)) batch_sizes = [1, 16, 128, 1024, 4096] mem_limit = 2 ** 20 vals = [] ids = [] for shape, batch in itertools.product(shapes, batch_sizes): if product(shape) * batch <= mem_limit: if batch == 1: vals.append((shape, None)) else: vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('shape_and_axes', vals, ids=ids) elif 'non2batch_shape_and_axes' in metafunc.funcargnames: def idgen(shape_and_axes): shape, axes = shape_and_axes assert len(axes) == 1 outer_batch = shape[:axes[0]] inner_batch = shape[axes[0]+1:] return ((str(outer_batch) + "x") if len(outer_batch) > 0 else "") + \ str(shape[axes[0]]) + "x" + str(inner_batch) vals = [ ((17, 16), (1,)), ((177, 256), (1,)), ((39, 16, 7), (1,)), ((17, 16, 131), (1,)), ((7, 1024, 11), (1,)), ((5, 1024, 57), (1,))] metafunc.parametrize('non2batch_shape_and_axes', vals, ids=list(map(idgen, vals))) elif 'non2problem_shape_and_axes' in metafunc.funcargnames: def idgen(non2problem_shape_and_axes): shape, axes = non2problem_shape_and_axes return str(shape) + 'over' + str(axes) vals = [ ((17, 15), (1,)), ((17, 17), (1,)), ((19, 4095), (1,)), ((19, 4097), (1,)), ((39, 31, 7), (1,)), ((39, 33, 7), (1,)), ((3, 255, 7), (1,)), ((3, 257, 7), (1,)), ((17, 200, 131), (0, 1)), ((7, 1000, 11), (1, 2)), ((15, 900, 57), (0, 1, 2))] metafunc.parametrize('non2problem_shape_and_axes', vals, ids=list(map(idgen, vals))) elif 'perf_shape_and_axes' in metafunc.funcargnames: vals = [] ids = [] for log_shape in perf_log_shapes: shape = tuple(2 ** x for x in log_shape) batch = perf_mem_limit // (2 ** sum(log_shape)) vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('perf_shape_and_axes', vals, ids=ids) elif 'non2problem_perf_shape_and_axes' in metafunc.funcargnames: vals = [] ids = [] for log_shape in perf_log_shapes: for modifier in (1, -1): shape = tuple(2 ** (x - 1) + modifier for x in log_shape) batch = perf_mem_limit // (2 ** sum(log_shape)) vals.append(((batch,) + shape, tuple(range(1, len(shape) + 1)))) ids.append(str(batch) + "x" + str(shape)) metafunc.parametrize('non2problem_perf_shape_and_axes', vals, ids=ids)
def set_shape(self, shape): self._shape = shape if shape is None: self._size = None else: self._size = product(shape)