def __init__(self, name, reduce_func, expr, in_param, out_param, axis):
    """Reduction operation.

    Args:
        name (str): Name of this reduction operation; also used to derive
            the per-operation block stride variable name.
        reduce_func (_reduction._SimpleReductionKernel): Reduction kernel
            supplying the CUDA preamble and the identity element.
        expr: A 4-tuple; the last three elements are the reduction
            expression, the postmap cast code and the reduce ctype.
            The first element is unused here.
        in_param (_TraceArray): Input array of the reduction.
        out_param (_TraceArray): Output array of the reduction.
        axis (tuple of int): Axes to reduce over; each must be a valid
            axis index of ``in_param``.
    """
    _fusion_thread_local.check_not_runtime()
    assert isinstance(name, str)
    assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
    assert isinstance(in_param, _TraceArray)
    assert isinstance(out_param, _TraceArray)
    assert isinstance(axis, tuple)
    assert all(0 <= x < in_param.ndim for x in axis)
    self.name = name
    self.preamble = reduce_func.preamble
    self.in_params = _VariableSet(in_param)
    self.out_params = _VariableSet(out_param)
    # CUDA variable holding the block stride for this reduction; the
    # operation name makes it unique within the fused kernel.
    self.block_stride_name = 'block_stride_' + name
    self.axis = axis
    # Identity element emitted as CUDA source text; the empty string
    # means the reduction has no identity.
    if reduce_func.identity is None:
        self.identity = ''
    else:
        self.identity = str(reduce_func.identity)
    _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr
    if self.reduce_ctype is None:
        # No explicit accumulator ctype: fall back to the dtype of the
        # single output array.
        out_param, = self.out_params
        self.reduce_ctype = get_typename(out_param.dtype)
    self.premap_op = None
    self.postmap_op = None
def params(self):
    """Returns the set of all variables the loop uses."""
    variables = _VariableSet()
    for routine in self.ops:
        for group in (routine.in_params, routine.out_params):
            variables += _VariableSet(*group)
    return variables
def __init__(self, ufunc_routines, in_params, out_params, ashape):
    """A sequence of elementwise ufunc operations over one shape.

    Args:
        ufunc_routines (list of _UfuncRoutine): The elementwise
            operations executed by this op, in order.
        in_params: Input variables of the operations.
        out_params: Output variables of the operations.
        ashape (tuple): The shape the operations iterate over.
    """
    # `in_params` and `out_params` should already be broadcast to
    # `ashape`, but they are not guaranteed to be exactly the same as
    # `param.ashape`.
    _fusion_thread_local.check_not_runtime()
    assert isinstance(ufunc_routines, list)
    assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines)
    assert isinstance(ashape, tuple)
    self.ops = ufunc_routines
    self.in_params = _VariableSet(*in_params)
    self.out_params = _VariableSet(*out_params)
    self.ashape = ashape
def _emit_after_operation(out_params):
    """Returns a tuple of size 2.

    1. CUDA code: writing the results of operations back to global memory.
    2. The set of arrays which require indexer.
    """
    _fusion_thread_local.check_not_runtime()
    indexed_arrays = _VariableSet()
    lines = []
    for param in out_params:
        if isinstance(param, _TraceArray):
            # Array output: store through its indexer into global memory.
            indexed_arrays.add(param)
            template = '${var}[${indexer}.get()] = ${lvar};'
        else:
            # Scalar output: plain assignment.
            template = '${var} = ${lvar};'
        lines.append(param.format(template))
    return lines, indexed_arrays
def _emit_declaration(params, in_params):
    """Returns a tuple of size 2.

    1. CUDA code: declaring local variables.
    2. The set of arrays which require indexer.
    """
    _fusion_thread_local.check_not_runtime()
    indexed_arrays = _VariableSet()
    lines = []
    for param in params:
        if param not in in_params:
            # Not an input: declare the local without initialization.
            template = '${type} ${lvar};'
        elif isinstance(param, _TraceArray):
            # Array input: load the element via the array's indexer.
            indexed_arrays.add(param)
            template = '${type} ${lvar} = ${var}[${indexer}.get()];'
        else:
            # Scalar input: copy the value directly.
            template = '${type} ${lvar} = ${var};'
        lines.append(param.format(template))
    return lines, indexed_arrays
def _reduce_memory_access(ops):
    """Eliminates stores whose results are never used.

    An output parameter is kept only if its memory space is an in/out
    memory of the fused kernel or is read by a later operation.
    Operations left with no output parameters are dropped entirely.
    """
    # Seed the live set with every memory space visible outside the kernel.
    live_memories = set()
    for op in ops:
        for param in op.in_params + op.out_params:
            if param.memory.is_inout:
                live_memories.add(param.memory)
    # Backward pass: filter each op's outputs against the memories that
    # are live *after* it, then mark its reads as live for earlier ops.
    for op in reversed(ops):
        read_memories = {param.memory for param in op.in_params}
        kept = [
            param for param in op.out_params
            if param.memory in live_memories
        ]
        op.out_params = _fusion_variable._VariableSet(*kept)
        # TODO(asi1024): The following improvement can be applicable only
        # when the memory space is used at most once.
        # `live_memories -= written memories`
        live_memories |= read_memories
    return [op for op in ops if len(op.out_params) > 0]