def beam_cube_dde(beam, beam_lm_ext, beam_freq_map, lm, parangles, pointing_errors, antenna_scaling, frequencies): corrs = beam.shape[3:] if beam.shape[2] >= BEAM_NUD_LIMIT: raise ValueError("beam_nud exceeds %d" % BEAM_NUD_LIMIT) nsrc = lm.shape[0] ntime, na = parangles.shape nchan = frequencies.shape[0] ncorr = reduce(mul, corrs, 1) nchancorr = nchan*ncorr oshape = (nsrc, ntime, na, nchan) + corrs if len(corrs) > 1: # Flatten the beam correlation dims fbeam = beam.reshape(beam.shape[:3] + (ncorr,)) else: fbeam = beam # Generate frequency interpolation kernel ikernel, iblock, idt = _generate_interp_kernel(beam_freq_map, frequencies) # Generate main beam cube kernel kernel, block, dtype = _generate_main_kernel(fbeam, beam_lm_ext, beam_freq_map, lm, parangles, pointing_errors, antenna_scaling, frequencies, len(oshape), ncorr) # Call frequency interpolation kernel igrid = grids((nchan, 1, 1), iblock) freq_data = cp.empty((3, nchan), dtype=frequencies.dtype) try: ikernel(igrid, iblock, (frequencies, beam_freq_map, freq_data)) except CompileException: log.exception(format_code(ikernel.code)) raise # Call main beam cube kernel out = cp.empty((nsrc, ntime, na, nchan) + (ncorr,), dtype=beam.dtype) grid = grids((nchancorr, na, ntime), block) try: kernel(grid, block, (fbeam, beam_lm_ext, beam_freq_map, lm, parangles, pointing_errors, antenna_scaling, frequencies, freq_data, nsrc, out)) except CompileException: log.exception(format_code(kernel.code)) raise return out.reshape(oshape)
def convert(inputs, input_schema, output_schema): (kernel, block, in_shape, out_shape, dtype) = _generate_kernel(inputs, input_schema, output_schema) # Flatten non-schema input dimensions, # from inspection of the cupy reshape code, # this incurs a copy when inputs is non-contiguous nsrc = reduce(mul, inputs.shape[:-len(in_shape)], 1) nelems = reduce(mul, in_shape, 1) rinputs = inputs.reshape(nsrc, nelems) assert rinputs.flags.c_contiguous grid = grids((nsrc, 1, 1), block) outputs = cp.empty(shape=rinputs.shape, dtype=dtype) try: kernel(grid, block, (rinputs, outputs)) except CompileException: log.exception(format_code(kernel.code)) raise shape = inputs.shape[:-len(in_shape)] + out_shape outputs = outputs.reshape(shape) assert outputs.flags.c_contiguous return outputs
def test_cuda_inplace_warp_transpose(ncorrs, dtype, nvis, debug): cp = pytest.importorskip('cupy') path = pjoin("rime", "cuda", "tests", "test_warp_transpose.cu.j2") render = jinja_env.get_template(path).render dtypes = { np.float32: 'float', np.float64: 'double', np.int32: 'int', np.complex64: 'float2', np.complex128: 'double2', } code = render(type=dtypes[dtype], warp_size=32, corrs=ncorrs, debug=debug).encode("utf-8") kernel = cp.RawKernel(code, "kernel") inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs) outputs = cp.empty_like(inputs) args = (inputs, outputs) block = (256, 1, 1) grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block)) try: kernel(grid, block, args) except cp.cuda.compiler.CompileException: print(format_code(kernel.code)) raise np.testing.assert_array_almost_equal(cp.asnumpy(inputs), cp.asnumpy(outputs))
def compile_using_nvcc(source, options=None, arch=None, filename='kern.cu'): options = options or [] if arch is None: cuda_info = get_cuda_info() arch = min( [dev['major'] * 10 + dev['minor'] for dev in cuda_info['devices']]) cc = get_compiler() settings = get_compiler_setting() arch = "--generate-code=arch=compute_{a},code=sm_{a}".format(a=arch) options += ['-cubin'] cupy_path = resource_filename("cupy", pjoin("core", "include")) settings['include_dirs'].append(cupy_path) with _tempdir() as tmpdir: tmpfile = pjoin(tmpdir, filename) with open(tmpfile, "w") as f: f.write(source) try: stderr_file = pjoin(tmpdir, "stderr.txt") with stdchannel_redirected(sys.stderr, stderr_file): objects = cc.compile([tmpfile], include_dirs=settings['include_dirs'], macros=settings['define_macros'], extra_postargs=options) except errors.CompileError as e: with open(stderr_file, "r") as f: errs = f.read() lines = [ "The following source code", format_code(source), "", "created the following compilation errors", "", errs.strip(), str(e).strip() ] ex = errors.CompileError("\n".join(lines)) raise (ex, None, sys.exc_info()[2]) assert len(objects) == 1 mod = cp.cuda.function.Module() mod.load_file(objects[0]) return mod
def phase_delay(lm, uvw, frequency): kernel, block, out_dtype = _generate_kernel(lm, uvw, frequency) grid = grids((frequency.shape[0], uvw.shape[0], 1), block) out = cp.empty(shape=(lm.shape[0], uvw.shape[0], frequency.shape[0]), dtype=out_dtype) try: kernel(grid, block, (lm, uvw, frequency, out)) except CompileException: log.exception(format_code(kernel.code)) raise return out
def feed_rotation(parallactic_angles, feed_type='linear'): """ Cupy implementation of the feed_rotation kernel. """ kernel, block, out_dtype = _generate_kernel(parallactic_angles, feed_type) in_shape = parallactic_angles.shape parallactic_angles = parallactic_angles.ravel() grid = grids((parallactic_angles.shape[0], 1, 1), block) out = cp.empty(shape=(parallactic_angles.shape[0], 4), dtype=out_dtype) try: kernel(grid, block, (parallactic_angles, out)) except CompileException: log.exception(format_code(kernel.code)) raise return out.reshape(in_shape + (2, 2))
def test_cuda_shuffle_transpose_2(ncorrs): cp = pytest.importorskip("cupy") jinja2 = pytest.importorskip("jinja2") # Implement a warp transpose using Kepler's register shuffle instructions # as described in del Mundo's # `Towards a performance-portable FFT library for heterogeneous computing` # https://doi.org/10.1145/2597917.2597943 # https://homes.cs.washington.edu/~cdel/papers/cf14-fft.pdf # The poster is especially informative # https://homes.cs.washington.edu/~cdel/posters/073113-on-efficacy-shuffle-sc2013.pdf # and # `Enabling Efficient Intra-Warp Communication for # Fourier Transforms in a Many-Core Architecture.` # https://homes.cs.washington.edu/~cdel/papers/sc13-shuffle-abstract.pdf # http://sc13.supercomputing.org/sites/default/files/PostersArchive/spost142.html _TEMPLATE = jinja2.Template(""" #include <cupy/carray.cuh> {%- if (corrs < 1 or (corrs.__and__(corrs - 1) != 0)) %} {{ throw("corrs must be 1 or a power of 2") }} {%- endif %} {% macro warp_transpose(var_name, var_type, var_length, tmp_name="tmp") %} {% if var_length > 1 %} { int mask = __activemask(); int case_id = threadIdx.x & {{var_length - 1}}; {{var_type}} {{tmp_name}}; // For variable swaps // Horizontal (inter-thread) Rotation int addr = case_id; {%- for case in range(var_length) %} {{var_name}}[{{case}}] = __shfl_sync(mask, {{var_name}}[{{case}}], addr, {{var_length}}); {%- if not loop.last %} addr = __shfl_sync(mask, addr, (case_id + 1) & {{var_length - 1}}, {{var_length}}); {%- endif %} {%- endfor %} // Vertical (intra-thread) Rotation {%- for case in range(var_length) %} // Case {{case}} {%- set cycles = register_assign_cycles(corrs, case) %} {%- for cycle in cycles %} {%- set cstart = cycle[0][0] %} {{tmp_name}} = {{var_name}}[{{cstart}}]; {%- for dest, src in cycle %} {%- set src_var = tmp_name if cstart == src else var_name + "[" + src|string + "]" %} {{var_name}}[{{dest}}] = case_id == {{case}} ? {{src_var}} : {{var_name}}[{{dest}}]; {%- endfor %} {%- endfor %} {%- endfor %} // Horizontal (inter-thread) Rotation addr = ({{var_length}} - case_id) & {{var_length - 1}}; {%- for case in range(var_length) %} {{var_name}}[{{case}}] = __shfl_sync(mask, {{var_name}}[{{case}}], addr, {{var_length}}); {%- if not loop.last %} addr = __shfl_sync(mask, addr, (case_id + {{var_length - 1}}) & {{var_length - 1}}, {{var_length}}); {%- endif %} {%- endfor %} } {%- endif %} {%- endmacro %} {%- set width = corrs %} extern "C" __global__ void kernel( const CArray<{{type}}, 2> input, CArray<{{type}}, 2> output) { const ptrdiff_t & nvis = input.shape()[0]; int v = blockIdx.x*blockDim.x + threadIdx.x; if(v >= nvis) { return; } // Array to hold our variables {{type}} values[{{corrs}}]; {% for corr in range(corrs) %} values[{{corr}}] = input[v + {{corr}}*nvis]; {%- endfor %} if({{debug}}) { if(threadIdx.x == 0) { printf("mask %d\\n", __activemask()); } printf("[%d] %d %d %d %d\\n", threadIdx.x & {{warp_size - 1}}, values[0], values[1], values[2], values[3]); if(threadIdx.x == 0) { printf("\\n"); } } {{ warp_transpose("values", type, corrs) }} {{ warp_transpose("values", type, corrs) }} if({{debug}}) { if(threadIdx.x == 0) { printf("\\n"); } printf("[%d] %d %d %d %d\\n", threadIdx.x & {{warp_size - 1}}, values[0], values[1], values[2], values[3]); } {% for corr in range(corrs) %} output[v + {{corr}}*nvis] = values[{{corr}}]; {%- endfor %} } """) # noqa nvis = 32 dtype = np.int32 dtypes = { np.float32: 'float', np.float64: 'double', np.int32: 'int', } code = _TEMPLATE.render(type=dtypes[dtype], throw=throw_helper, register_assign_cycles=register_assign_cycles, warp_size=32, corrs=ncorrs, debug="false").encode("utf-8") kernel = cp.RawKernel(code, "kernel") inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs) outputs = cp.empty_like(inputs) args = (inputs, outputs) block = (256, 1, 1) grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block)) try: kernel(grid, block, args) except cp.cuda.compiler.CompileException: print(format_code(kernel.code)) raise np.testing.assert_array_almost_equal(cp.asnumpy(inputs), cp.asnumpy(outputs)) return # Dead code print(grid, block) print("\n") print(inputs) print(outputs)
def test_cuda_shuffle_transpose(): cp = pytest.importorskip("cupy") jinja2 = pytest.importorskip("jinja2") _TEMPLATE = jinja2.Template(""" #include <cupy/carray.cuh> #define debug {{debug}} extern "C" __global__ void kernel( const CArray<{{type}}, 2> input, CArray<{{type}}, 2> output) { const ptrdiff_t & nvis = input.shape()[0]; int v = blockIdx.x*blockDim.x + threadIdx.x; int lane_id = threadIdx.x & ({{warp_size}} - 1); if(v >= nvis) { return; } // Input correlation handled by this thread int mask = __activemask(); {{type}} loads[{{corrs}}]; {{type}} values[{{corrs}}]; {% for corr in range(corrs) %} loads[{{corr}}] = input[v + {{corr}}*nvis]; {%- endfor %} __syncthreads(); if(debug) { if(threadIdx.x == 0) { printf("mask %d\\n", mask); } printf("[%d] %d %d %d %d\\n", lane_id, loads[0], loads[1], loads[2], loads[3]); if(threadIdx.x == 0) { printf("\\n"); } } // Tranpose forward #pragma unroll ({{corrs}}) for(int corr=0; corr < {{corrs}}; ++corr) { int src_corr = ({{corrs}} - corr + lane_id) % {{corrs}}; int dest_corr = (lane_id + corr) % {{corrs}}; int src_lane = (lane_id / {{corrs}})*{{corrs}} + dest_corr; values[dest_corr] = __shfl_sync(mask, loads[src_corr], src_lane, {{warp_size}}); } // Copy #pragma unroll ({{corrs}}) for(int corr=0; corr < {{corrs}}; ++corr) { loads[corr] = values[corr]; } // Transpose backward #pragma unroll ({{corrs}}) for(int corr=0; corr < {{corrs}}; ++corr) { int src_corr = ({{corrs}} - corr + lane_id) % {{corrs}}; int dest_corr = (lane_id + corr) % {{corrs}}; int src_lane = (lane_id / {{corrs}})*{{corrs}} + dest_corr; values[dest_corr] = __shfl_sync(mask, loads[src_corr], src_lane, {{warp_size}}); } __syncthreads(); if(debug) { if(threadIdx.x == 0) { printf("\\n"); } printf("[%d] %d %d %d %d\\n", lane_id, values[0], values[1], values[2], values[3]); } {% for corr in range(corrs) %} output[v + {{corr}}*nvis] = values[{{corr}}]; {%- endfor %} } """) nvis = 32 ncorrs = 4 dtype = np.int32 dtypes = { np.float32: 'float', np.float64: 'double', np.int32: 'int', } code = _TEMPLATE.render(type=dtypes[dtype], warp_size=32, corrs=ncorrs, debug="false").encode("utf-8") kernel = cp.RawKernel(code, "kernel") inputs = cp.arange(nvis * ncorrs, dtype=dtype).reshape(nvis, ncorrs) outputs = cp.empty_like(inputs) args = (inputs, outputs) block = (256, 1, 1) grid = tuple((d + b - 1) // b for d, b in zip((nvis, 1, 1), block)) try: kernel(grid, block, args) except cp.cuda.compiler.CompileException: print(format_code(kernel.code)) raise np.testing.assert_array_almost_equal(cp.asnumpy(inputs), cp.asnumpy(outputs)) return # Dead code print(grid, block) print("\n") print(inputs) print(outputs)
def predict_vis(time_index, antenna1, antenna2, dde1_jones=None, source_coh=None, dde2_jones=None, die1_jones=None, base_vis=None, die2_jones=None): """ Cupy implementation of the feed_rotation kernel. """ have_ddes = dde1_jones is not None and dde2_jones is not None have_dies = die1_jones is not None and die2_jones is not None have_coh = source_coh is not None have_bvis = base_vis is not None # Infer the output shape if have_ddes: row = time_index.shape[0] chan = dde1_jones.shape[3] corrs = dde1_jones.shape[4:] elif have_coh: row = time_index.shape[0] chan = source_coh.shape[2] corrs = source_coh.shape[3:] elif have_dies: row = time_index.shape[0] chan = die1_jones.shape[2] corrs = die1_jones.shape[3:] elif have_bvis: row = time_index.shape[0] chan = base_vis.shape[1] corrs = base_vis.shape[2:] else: raise ValueError("Insufficient inputs supplied for determining " "the output shape") ncorrs = len(corrs) # Flatten correlations if ncorrs == 2: flat_corrs = reduce(mul, corrs, 1) if have_ddes: dde_shape = dde1_jones.shape[:-ncorrs] + (flat_corrs, ) dde1_jones = dde1_jones.reshape(dde_shape) dde2_jones = dde2_jones.reshape(dde_shape) if have_coh: coh_shape = source_coh.shape[:-ncorrs] + (flat_corrs, ) source_coh = source_coh.reshape(coh_shape) if have_dies: die_shape = die1_jones.shape[:-ncorrs] + (flat_corrs, ) die1_jones = die1_jones.reshape(die_shape) die2_jones = die2_jones.reshape(die_shape) if have_bvis: bvis_shape = base_vis.shape[:-ncorrs] + (flat_corrs, ) base_vis = base_vis.reshape(bvis_shape) elif ncorrs == 1: flat_corrs = corrs[0] else: raise ValueError("Invalid correlation setup %s" % (corrs, )) out_shape = (row, chan) + (flat_corrs, ) kernel, block, out_dtype = _generate_kernel(time_index, antenna1, antenna2, dde1_jones, source_coh, dde2_jones, die1_jones, base_vis, die2_jones, corrs, len(out_shape)) grid = grids((chan * flat_corrs, row, 1), block) out = cp.empty(shape=out_shape, dtype=out_dtype) # Normalise the time index # TODO(sjperkins) # Normalise the time index with a device-wide reduction norm_time_index = time_index - time_index.min() args = (norm_time_index, antenna1, antenna2, dde1_jones, source_coh, dde2_jones, die1_jones, base_vis, die2_jones, out) try: kernel(grid, block, tuple(a for a in args if a is not None)) except CompileException: log.exception(format_code(kernel.code)) raise return out.reshape((row, chan) + corrs)