Пример #1
0
def _get_simple_elementwise_kernel(params,
                                   operation,
                                   name,
                                   preamble,
                                   loop_prep='',
                                   after_loop='',
                                   options=()):
    module_code = string.Template('''
    ${preamble}
    extern "C" __global__ void ${name}(${params}) {
      ${loop_prep};
      CUPY_FOR(i, _ind.size()) {
        _ind.set(i);
        ${operation};
      }
      ${after_loop};
    }
    ''').substitute(params=params,
                    operation=operation,
                    name=name,
                    preamble=preamble,
                    loop_prep=loop_prep,
                    after_loop=after_loop)
    module = carray.compile_with_cache(module_code, options)
    return module.get_function(name)
Пример #2
0
def _get_simple_elementwise_kernel(
        params, operation, name, preamble,
        loop_prep='', after_loop='', options=()):
    module_code = string.Template('''
    ${preamble}
    extern "C" __global__ void ${name}(${params}) {
      ${loop_prep};
      CUPY_FOR(i, _ind.size()) {
        _ind.set(i);
        ${operation};
      }
      ${after_loop};
    }
    ''').substitute(
        params=params,
        operation=operation,
        name=name,
        preamble=preamble,
        loop_prep=loop_prep,
        after_loop=after_loop)
    module = carray.compile_with_cache(module_code, options)
    return module.get_function(name)
Пример #3
0
def _get_simple_reduction_kernel(
        name, block_size, reduce_type, params, identity,
        pre_map_expr, reduce_expr, post_map_expr,
        type_preamble, input_expr, output_expr, preamble, options):
    if identity is None:
        identity = ''
    module_code = string.Template('''
    ${type_preamble}
    ${preamble}
    #define REDUCE(a, b) (${reduce_expr})
    #define POST_MAP(a) (${post_map_expr})

    typedef ${reduce_type} _type_reduce;
    extern "C" __global__ void ${name}(${params}) {
      if (_out_clp2_size > 256) {
        CUPY_FOR(_i, _out_ind.size()) {
          _type_reduce _s = _type_reduce(${identity});
          for (int _j = _i, _J = 0;
               _j < _in_ind.size();
               _j += _out_ind.size(), _J++) {
            _in_ind.set(_j);
            ${input_expr}
            _type_reduce _a = ${pre_map_expr};
            _s = REDUCE(_s, _a);
          }
          _out_ind.set(_i);
          ${output_expr}
          POST_MAP(_s);
        }
      } else {
        extern __shared__ _type_reduce _sdata_raw[];
        _type_reduce *_sdata = _sdata_raw;
        int _tid = threadIdx.x;
        _sdata[_tid] = _type_reduce(${identity});
        unsigned int _i = _tid % _out_clp2_size;
        if (_i >= _out_ind.size()) return;
        _type_reduce _s = _type_reduce(${identity});
        int _J_offset = _tid / _out_clp2_size;
        int _j_offset = _J_offset * _out_ind.size();
        int _J_stride = ${block_size} / _out_clp2_size;
        int _j_stride = _J_stride * _out_ind.size();
        for (int _j = _i + _j_offset, _J = _J_offset;
             _j < _in_ind.size();
             _j += _j_stride, _J += _J_stride) {
          _in_ind.set(_j);
          ${input_expr}
          _type_reduce _a = ${pre_map_expr};
          _s = REDUCE(_s, _a);
        }
        _sdata[_tid] = _s;
        __syncthreads();
        if (_tid >= 256) return;
        _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 256]);
        __syncthreads();
        if (_out_clp2_size <= 128) {
          _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 128]);
          __syncthreads();
          if (_out_clp2_size <= 64) {
            _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 64]);
            __syncthreads();
            if (_out_clp2_size <= 32) {
              _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 32]);
              if (_out_clp2_size <= 16) {
                _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 16]);
                if (_out_clp2_size <= 8) {
                  _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 8]);
                  if (_out_clp2_size <= 4) {
                    _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 4]);
                    if (_out_clp2_size <= 2) {
                      _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 2]);
                      if (_out_clp2_size <= 1) {
                        _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 1]);
                      }
                    }
                  }
                }
              }
            }
          }
        }
        _s = _sdata[_tid];
        if (_tid >= _out_ind.size()) return;
        _out_ind.set(_i);
         ${output_expr}
        POST_MAP(_s);
      }
    }''').substitute(
        name=name,
        block_size=block_size,
        reduce_type=reduce_type,
        params=params,
        identity=identity,
        reduce_expr=reduce_expr,
        pre_map_expr=pre_map_expr,
        post_map_expr=post_map_expr,
        type_preamble=type_preamble,
        input_expr=input_expr,
        output_expr=output_expr,
        preamble=preamble)
    module = carray.compile_with_cache(module_code, options)
    return module.get_function(name)
Пример #4
0
def _get_simple_reduction_kernel(name, block_size, reduce_type, params,
                                 identity, pre_map_expr, reduce_expr,
                                 post_map_expr, type_preamble, input_expr,
                                 output_expr, preamble, options):
    if identity is None:
        identity = ''
    module_code = string.Template('''
    ${type_preamble}
    ${preamble}
    #define REDUCE(a, b) (${reduce_expr})
    #define POST_MAP(a) (${post_map_expr})

    typedef ${reduce_type} _type_reduce;
    extern "C" __global__ void ${name}(${params}) {
      if (_out_clp2_size > 256) {
        CUPY_FOR(_i, _out_ind.size()) {
          _type_reduce _s = _type_reduce(${identity});
          for (int _j = _i, _J = 0;
               _j < _in_ind.size();
               _j += _out_ind.size(), _J++) {
            _in_ind.set(_j);
            ${input_expr}
            _type_reduce _a = ${pre_map_expr};
            _s = REDUCE(_s, _a);
          }
          _out_ind.set(_i);
          ${output_expr}
          POST_MAP(_s);
        }
      } else {
        extern __shared__ _type_reduce _sdata_raw[];
        _type_reduce *_sdata = _sdata_raw;
        int _tid = threadIdx.x;
        _sdata[_tid] = _type_reduce(${identity});
        unsigned int _i = _tid % _out_clp2_size;
        if (_i >= _out_ind.size()) return;
        _type_reduce _s = _type_reduce(${identity});
        int _J_offset = _tid / _out_clp2_size;
        int _j_offset = _J_offset * _out_ind.size();
        int _J_stride = ${block_size} / _out_clp2_size;
        int _j_stride = _J_stride * _out_ind.size();
        for (int _j = _i + _j_offset, _J = _J_offset;
             _j < _in_ind.size();
             _j += _j_stride, _J += _J_stride) {
          _in_ind.set(_j);
          ${input_expr}
          _type_reduce _a = ${pre_map_expr};
          _s = REDUCE(_s, _a);
        }
        _sdata[_tid] = _s;
        __syncthreads();
        if (_tid >= 256) return;
        _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 256]);
        __syncthreads();
        if (_out_clp2_size <= 128) {
          _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 128]);
          __syncthreads();
          if (_out_clp2_size <= 64) {
            _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 64]);
            __syncthreads();
            if (_out_clp2_size <= 32) {
              _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 32]);
              if (_out_clp2_size <= 16) {
                _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 16]);
                if (_out_clp2_size <= 8) {
                  _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 8]);
                  if (_out_clp2_size <= 4) {
                    _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 4]);
                    if (_out_clp2_size <= 2) {
                      _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 2]);
                      if (_out_clp2_size <= 1) {
                        _sdata[_tid] = REDUCE(_sdata[_tid], _sdata[_tid + 1]);
                      }
                    }
                  }
                }
              }
            }
          }
        }
        _s = _sdata[_tid];
        if (_tid >= _out_ind.size()) return;
        _out_ind.set(_i);
         ${output_expr}
        POST_MAP(_s);
      }
    }''').substitute(name=name,
                     block_size=block_size,
                     reduce_type=reduce_type,
                     params=params,
                     identity=identity,
                     reduce_expr=reduce_expr,
                     pre_map_expr=pre_map_expr,
                     post_map_expr=post_map_expr,
                     type_preamble=type_preamble,
                     input_expr=input_expr,
                     output_expr=output_expr,
                     preamble=preamble)
    module = carray.compile_with_cache(module_code, options)
    return module.get_function(name)
Пример #5
0
def _get_simple_reduction_kernel(
        name, block_size, reduce_type, params, identity,
        pre_map_expr, reduce_expr, post_map_expr,
        type_preamble, input_expr, output_expr, preamble, options):
    if identity is None:
        identity = ''
    module_code = string.Template('''
    ${type_preamble}
    ${preamble}
    #define REDUCE(a, b) (${reduce_expr})
    #define POST_MAP(a) (${post_map_expr})
    #define _REDUCE(_offset) if (_tid < _offset) { \
      _type_reduce _a = _sdata[_tid], _b = _sdata[(_tid + _offset)]; \
      _sdata[_tid] = REDUCE(_a, _b); \
    }

    typedef ${reduce_type} _type_reduce;
    extern "C" __global__ void ${name}(${params}) {
      extern __shared__ _type_reduce _sdata_raw[];
      _type_reduce *_sdata = _sdata_raw;
      unsigned int _tid = threadIdx.x;

      int _J_offset = _tid / _block_stride;
      int _j_offset = _J_offset * _out_ind.size();
      int _J_stride = ${block_size};
      int _j_stride = ${block_size} * _out_ind.size();

      for (int _i_base = blockIdx.x * _block_stride;
           _i_base < _out_ind.size();
           _i_base += gridDim.x * _block_stride) {
        _type_reduce _s = _type_reduce(${identity});
        int _i = _i_base + _tid % _block_stride;
        for (int _j = _i + _j_offset, _J = _J_offset;
             _j < _in_ind.size();
             _j += _j_stride, _J += _J_stride) {
          _in_ind.set(_j);
          ${input_expr}
          _type_reduce _a = ${pre_map_expr};
          _s = REDUCE(_s, _a);
        }
        if (_block_stride < ${block_size}) {
          _sdata[_tid] = _s;
          __syncthreads();
          if (_block_stride <= 256) {
            _REDUCE(256);
            __syncthreads();
            if (_block_stride <= 128) {
              _REDUCE(128)
              __syncthreads();
              if (_block_stride <= 64) {
                _REDUCE(64)
                __syncthreads();
                if (_block_stride <= 32) {
                  _REDUCE(32)
                  if (_block_stride <= 16) {
                    _REDUCE(16)
                    if (_block_stride <= 8) {
                      _REDUCE(8)
                      if (_block_stride <= 4) {
                        _REDUCE(4)
                        if (_block_stride <= 2) {
                          _REDUCE(2)
                          if (_block_stride <= 1) {
                            _REDUCE(1)
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
          _s = _sdata[_tid];
          __syncthreads();
        }
        if (_J_offset == 0 && _i < _out_ind.size()) {
          _out_ind.set(_i);
          ${output_expr}
          POST_MAP(_s);
        }
      }
    }''').substitute(
        name=name,
        block_size=block_size,
        reduce_type=reduce_type,
        params=params,
        identity=identity,
        reduce_expr=reduce_expr,
        pre_map_expr=pre_map_expr,
        post_map_expr=post_map_expr,
        type_preamble=type_preamble,
        input_expr=input_expr,
        output_expr=output_expr,
        preamble=preamble)
    module = carray.compile_with_cache(module_code, options)
    return module.get_function(name)