def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.PwQPolynomial` instances capturing the number of bytes
    read from or written to the array *var_name* (where *direction* is either
    ``read`` or ``write``).

    :arg ignore_uncountable: If *False*, an error will be raised for accesses
        on which the footprint cannot be determined (e.g. data-dependent or
        nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.kernel import kernel_state
    if kernel.state < kernel_state.PREPROCESSED:
        kernel = preprocess_kernel(kernel)

    result = {}
    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)

    for key, var_fp in fp.items():
        vname, direction = key

        var_descr = kernel.get_var_descriptor(vname)
        bytes_transferred = (
                int(var_descr.dtype.numpy_dtype.itemsize)
                * count(kernel, var_fp))

        if key in result:
            result[key] += bytes_transferred
        else:
            result[key] = bytes_transferred

    return result
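# Hedged usage sketch (not part of the loopy source): the kernel below and the
# parameter value are illustrative; the evaluation via eval_with_dict follows
# the convention used in the docstrings elsewhere in this collection.
import numpy as np
import loopy as lp

knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "n": np.int32})

footprint_bytes = gather_access_footprint_bytes(knl)

params = {'n': 512}
for (var_name, direction), poly in footprint_bytes.items():
    # each value is an islpy.PwQPolynomial; evaluate it at concrete sizes
    print(var_name, direction, poly.eval_with_dict(params))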
def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
    kernel = self.kernel

    from loopy.kernel.tools import add_dtypes

    if var_to_dtype_set:
        var_to_dtype = {}
        for var, dtype in var_to_dtype_set:
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            try:
                var_to_dtype[dest_name] = dtype
            except KeyError:
                raise LoopyError("cannot set type for '%s': "
                        "no known variable/argument with that name"
                        % var)

        kernel = add_dtypes(kernel, var_to_dtype)

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def get_op_poly(knl, numpy_types=True):
    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
             **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being accessed. This can
               be a :class:`numpy.dtype` if *numpy_types* is True, otherwise the
               internal loopy type.

             - The string specifies the operation type as *add*, *sub*, *mul*,
               *div*, *pow*, *shift*, *bw* (bitwise), etc.

             - The :class:`islpy.PwQPolynomial` holds the number of operations of
               the kind specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = ToCountMap()
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        ops = op_counter(insn.assignee) + op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)

    result = op_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind), count)
                for (dtype, kind), count in six.iteritems(result))

    return result
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.Set` instances capturing which indices of the array
    *var_name* are read/written (where *direction* is either ``read`` or
    ``write``).

    :arg ignore_uncountable: If *False*, an error will be raised for accesses
        on which the footprint cannot be determined (e.g. data-dependent or
        nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(assignee))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
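# Hedged usage sketch (not part of the loopy source); the kernel below is
# illustrative. Each value in the returned dictionary is an islpy.Set
# describing which indices of the corresponding array are touched.
import numpy as np
import loopy as lp

knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = a[i] + a[i+1]")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "n": np.int32})

footprints = gather_access_footprints(knl)

# the read footprint of 'a' covers indices 0..n inclusive, the write
# footprint of 'out' covers 0..n-1
print(footprints[("a", "read")])
print(footprints[("out", "write")])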
def get_barrier_poly(knl):
    """Count the number of barriers each thread encounters in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls
             made (in terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier
    from operator import mul

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    iname_list = []
    barrier_poly = isl.PwQPolynomial('{ 0 }')

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()
        elif isinstance(sched_item, Barrier):
            if iname_list:  # (if iname_list is not empty)
                ct = (count(knl, (
                                knl.get_inames_domain(iname_list).
                                project_out_except(iname_list, [dim_type.set])
                                )), )
                barrier_poly += reduce(mul, ct)
            else:
                barrier_poly += isl.PwQPolynomial('{ 1 }')

    return barrier_poly
def get_op_poly(knl):
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = 0
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    return op_poly
def estimate_regs_per_thread(knl):
    """Estimate registers per thread usage by a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.

    :return: An :class:`integer` holding an estimate for the number of registers
             used per thread. This number will most likely be too low, but will
             hopefully be consistently too low by the same constant factor.

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    max_regs = 0
    block_reg_totals = [0]
    # counters to track nested sets of previously used iname+index combinations
    reg_counters = [RegisterUsageEstimator(knl)]

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            block_reg_totals.append(0)
            # start a new estimator
            reg_counters.append(RegisterUsageEstimator(knl))

        elif isinstance(sched_item, LeaveLoop):
            if block_reg_totals[-1] > max_regs:
                max_regs = block_reg_totals[-1]
            # pop to resume previous total
            block_reg_totals.pop()
            reg_counters.pop()

        elif isinstance(sched_item, RunInstruction):
            insn = knl.id_to_insn[sched_item.insn_id]
            block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
                                    reg_counters[-1](insn.expression)

    # finished looping, check outer block
    if block_reg_totals[-1] > max_regs:
        max_regs = block_reg_totals[-1]

    return max_regs
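# Hedged usage sketch (not part of the loopy source): `knl` stands in for a
# loopy kernel whose dtypes are already specified, and the register budget is
# an illustrative hardware limit, not a value taken from the source.
regs = estimate_regs_per_thread(knl)

REGS_PER_THREAD_BUDGET = 255  # hypothetical per-thread register limit
if regs > REGS_PER_THREAD_BUDGET:
    # estimate is stated above to be a consistent underestimate, so exceeding
    # the budget is a strong hint that the kernel may spill registers
    print("kernel may spill registers (estimate: %d)" % regs)
else:
    print("estimated registers per thread: %d" % regs)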
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            <> t_private_scalar = a[k,i+1]
            <> t_private_array[i % 2] = a[k,i+1]
            c[k,i] = a[k,i+1]
            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
    knl = lp.set_temporary_scope(knl, "t_private_array", "private")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_poly = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_poly['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)
    from operator import mul

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    iname_list = []
    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl, (
                            knl.get_inames_domain(iname_list).
                            project_out_except(iname_list, [dim_type.set])
                            )), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being accessed. This can
               be a :class:`numpy.dtype` if *numpy_types* is True, otherwise the
               internal loopy type.

             - The first string in the map key specifies the global memory
               access type as *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [iname for iname in insn_inames if not
                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # use count excluding local index tags for uniform accesses
        for key in subs_expr.dict:
            poly = ToCountMap({key: subs_expr.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
        for key in subs_assignee.dict:
            poly = ToCountMap({key: subs_assignee.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)

    result = subs_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind, direction), count)
                for (dtype, kind, direction), count in six.iteritems(result))

    return result
""" for i, k ... gbarrier c[k,i] = a[k, i + 1] ... gbarrier out[k,i] = c[k,i] end """, seq_dependencies=True) # transform knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) # schedule from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"], knl.callables_table)) # map schedule onto host or device print(knl) cgr = lp.generate_code_v2(knl) print(cgr.device_code()) print(cgr.host_code())
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import KernelState
    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)

    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes,
            codegen_state=codegen_state
            )

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains isn't
    # needed.
    from loopy.tools import LazilyUnpicklingDict
    codegen_result = codegen_result.copy(
            implemented_domains=LazilyUnpicklingDict(
                codegen_result.implemented_domains))

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(input_kernel, codegen_result)

    return codegen_result
def get_gmem_access_poly(knl):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The :class:`numpy.dtype` specifies the type of the data being
               accessed.

             - The first string in the map key specifies the global memory
               access type as *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))

        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))

        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain)

    return subs_poly.dict
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)

    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            c[k,i] = a[k, i + 1]
            ... gbarrier
            out[k,i] = c[k,i]
        end
        """, seq_dependencies=True)

# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

knl = lp.add_and_infer_dtypes(knl,
        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)

# map schedule onto host or device
print(knl)

cgr = lp.generate_code_v2(knl)

print(cgr.device_code())
print(cgr.host_code())