def _get_simple_elementwise_kernel(params, operation, name, preamble, loop_prep='', after_loop='', options=()): module_code = string.Template(''' ${preamble} extern "C" __global__ void ${name}(${params}) { ${loop_prep}; CUPY_FOR(i, _ind.size()) { _ind.set(i); ${operation}; } ${after_loop}; } ''').substitute(params=params, operation=operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop) module = carray.compile_with_cache(module_code, options) return module.get_function(name)
def _get_simple_elementwise_kernel( params, operation, name, preamble, loop_prep='', after_loop='', options=()): module_code = string.Template(''' ${preamble} extern "C" __global__ void ${name}(${params}) { ${loop_prep}; CUPY_FOR(i, _ind.size()) { _ind.set(i); ${operation}; } ${after_loop}; } ''').substitute( params=params, operation=operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop) module = carray.compile_with_cache(module_code, options) return module.get_function(name)
def _get_simple_reduction_kernel( name, block_size, reduce_type, params, identity, pre_map_expr, reduce_expr, post_map_expr, type_preamble, input_expr, output_expr, preamble, options): if identity is None: identity = '' module_code = string.Template(''' ${type_preamble} ${preamble} #define REDUCE(a, b) (${reduce_expr}) #define POST_MAP(a) (${post_map_expr}) typedef ${reduce_type} _type_reduce; extern "C" __global__ void ${name}(${params}) { if (_out_clp2_size > 256) { CUPY_FOR(_i, _out_ind.size()) { _type_reduce _s = _type_reduce(${identity}); for (int _j = _i, _J = 0; _j < _in_ind.size(); _j += _out_ind.size(), _J++) { _in_ind.set(_j); ${input_expr} _type_reduce _a = ${pre_map_expr}; _s = REDUCE(_s, _a); } _out_ind.set(_i); ${output_expr} POST_MAP(_s); } } else { extern __shared__ _type_reduce _sdata_raw[]; _type_reduce *_sdata = _sdata_raw; int _tid = threadIdx.x; _sdata[_tid] = _type_reduce(${identity}); unsigned int _i = _tid % _out_clp2_size; if (_i >= _out_ind.size()) return; _type_reduce _s = _type_reduce(${identity}); int _J_offset = _tid / _out_clp2_size; int _j_offset = _J_offset * _out_ind.size(); int _J_stride = ${block_size} / _out_clp2_size; int _j_stride = _J_stride * _out_ind.size(); for (int _j = _i + _j_offset, _J = _J_offset; _j < _in_ind.size(); _j += _j_stride, _J += _J_stride) { _in_ind.set(_j); ${input_expr} _type_reduce _a = ${pre_map_expr}; _s = REDUCE(_s, _a); } _sdata[_tid] = _s; __syncthreads(); if (_tid >= 256) return; _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 256]); __syncthreads(); if (_out_clp2_size <= 128) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 128]); __syncthreads(); if (_out_clp2_size <= 64) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 64]); __syncthreads(); if (_out_clp2_size <= 32) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 32]); if (_out_clp2_size <= 16) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 16]); if (_out_clp2_size <= 8) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 8]); if (_out_clp2_size <= 4) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 4]); if (_out_clp2_size <= 2) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 2]); if (_out_clp2_size <= 1) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 1]); } } } } } } } } _s = _sdata[_tid]; if (_tid >= _out_ind.size()) return; _out_ind.set(_i); ${output_expr} POST_MAP(_s); } }''').substitute( name=name, block_size=block_size, reduce_type=reduce_type, params=params, identity=identity, reduce_expr=reduce_expr, pre_map_expr=pre_map_expr, post_map_expr=post_map_expr, type_preamble=type_preamble, input_expr=input_expr, output_expr=output_expr, preamble=preamble) module = carray.compile_with_cache(module_code, options) return module.get_function(name)
def _get_simple_reduction_kernel(name, block_size, reduce_type, params, identity, pre_map_expr, reduce_expr, post_map_expr, type_preamble, input_expr, output_expr, preamble, options): if identity is None: identity = '' module_code = string.Template(''' ${type_preamble} ${preamble} #define REDUCE(a, b) (${reduce_expr}) #define POST_MAP(a) (${post_map_expr}) typedef ${reduce_type} _type_reduce; extern "C" __global__ void ${name}(${params}) { if (_out_clp2_size > 256) { CUPY_FOR(_i, _out_ind.size()) { _type_reduce _s = _type_reduce(${identity}); for (int _j = _i, _J = 0; _j < _in_ind.size(); _j += _out_ind.size(), _J++) { _in_ind.set(_j); ${input_expr} _type_reduce _a = ${pre_map_expr}; _s = REDUCE(_s, _a); } _out_ind.set(_i); ${output_expr} POST_MAP(_s); } } else { extern __shared__ _type_reduce _sdata_raw[]; _type_reduce *_sdata = _sdata_raw; int _tid = threadIdx.x; _sdata[_tid] = _type_reduce(${identity}); unsigned int _i = _tid % _out_clp2_size; if (_i >= _out_ind.size()) return; _type_reduce _s = _type_reduce(${identity}); int _J_offset = _tid / _out_clp2_size; int _j_offset = _J_offset * _out_ind.size(); int _J_stride = ${block_size} / _out_clp2_size; int _j_stride = _J_stride * _out_ind.size(); for (int _j = _i + _j_offset, _J = _J_offset; _j < _in_ind.size(); _j += _j_stride, _J += _J_stride) { _in_ind.set(_j); ${input_expr} _type_reduce _a = ${pre_map_expr}; _s = REDUCE(_s, _a); } _sdata[_tid] = _s; __syncthreads(); if (_tid >= 256) return; _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 256]); __syncthreads(); if (_out_clp2_size <= 128) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 128]); __syncthreads(); if (_out_clp2_size <= 64) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 64]); __syncthreads(); if (_out_clp2_size <= 32) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 32]); if (_out_clp2_size <= 16) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 16]); if (_out_clp2_size <= 8) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 8]); if (_out_clp2_size <= 4) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 4]); if (_out_clp2_size <= 2) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 2]); if (_out_clp2_size <= 1) { _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 1]); } } } } } } } } _s = _sdata[_tid]; if (_tid >= _out_ind.size()) return; _out_ind.set(_i); ${output_expr} POST_MAP(_s); } }''').substitute(name=name, block_size=block_size, reduce_type=reduce_type, params=params, identity=identity, reduce_expr=reduce_expr, pre_map_expr=pre_map_expr, post_map_expr=post_map_expr, type_preamble=type_preamble, input_expr=input_expr, output_expr=output_expr, preamble=preamble) module = carray.compile_with_cache(module_code, options) return module.get_function(name)
def _get_simple_reduction_kernel( name, block_size, reduce_type, params, identity, pre_map_expr, reduce_expr, post_map_expr, type_preamble, input_expr, output_expr, preamble, options): if identity is None: identity = '' module_code = string.Template(''' ${type_preamble} ${preamble} #define REDUCE(a, b) (${reduce_expr}) #define POST_MAP(a) (${post_map_expr}) #define _REDUCE(_offset) if (_tid < _offset) { \ _type_reduce _a = _sdata[_tid], _b = _sdata[(_tid + _offset)]; \ _sdata[_tid] = REDUCE(_a, _b); \ } typedef ${reduce_type} _type_reduce; extern "C" __global__ void ${name}(${params}) { extern __shared__ _type_reduce _sdata_raw[]; _type_reduce *_sdata = _sdata_raw; unsigned int _tid = threadIdx.x; int _J_offset = _tid / _block_stride; int _j_offset = _J_offset * _out_ind.size(); int _J_stride = ${block_size}; int _j_stride = ${block_size} * _out_ind.size(); for (int _i_base = blockIdx.x * _block_stride; _i_base < _out_ind.size(); _i_base += gridDim.x * _block_stride) { _type_reduce _s = _type_reduce(${identity}); int _i = _i_base + _tid % _block_stride; for (int _j = _i + _j_offset, _J = _J_offset; _j < _in_ind.size(); _j += _j_stride, _J += _J_stride) { _in_ind.set(_j); ${input_expr} _type_reduce _a = ${pre_map_expr}; _s = REDUCE(_s, _a); } if (_block_stride < ${block_size}) { _sdata[_tid] = _s; __syncthreads(); if (_block_stride <= 256) { _REDUCE(256); __syncthreads(); if (_block_stride <= 128) { _REDUCE(128) __syncthreads(); if (_block_stride <= 64) { _REDUCE(64) __syncthreads(); if (_block_stride <= 32) { _REDUCE(32) if (_block_stride <= 16) { _REDUCE(16) if (_block_stride <= 8) { _REDUCE(8) if (_block_stride <= 4) { _REDUCE(4) if (_block_stride <= 2) { _REDUCE(2) if (_block_stride <= 1) { _REDUCE(1) } } } } } } } } } _s = _sdata[_tid]; __syncthreads(); } if (_J_offset == 0 && _i < _out_ind.size()) { _out_ind.set(_i); ${output_expr} POST_MAP(_s); } } }''').substitute( name=name, block_size=block_size, reduce_type=reduce_type, params=params, identity=identity, reduce_expr=reduce_expr, pre_map_expr=pre_map_expr, post_map_expr=post_map_expr, type_preamble=type_preamble, input_expr=input_expr, output_expr=output_expr, preamble=preamble) module = carray.compile_with_cache(module_code, options) return module.get_function(name)