def get_tiling_space(kernel_desc, level=1, attr=None):
    """
    Query the tiling space of a composite kernel.

    Args:
        kernel_desc : str of compute description
        level : info level; stored in the build attributes as 'help_tiling'.
            When it is at least 2 the tiling candidates are returned as well.
        attr : dict of build attributes; an empty dict is used when None.
            A caller-supplied dict is updated in place.

    Returns:
        dict with the keys 'index', 'l1_range', 'l0_range', 'l1_mod' and
        'l0_mod' (plus 'tuning_space' when level >= 2), each holding the
        nested-list form of a table returned by "composite_lower".
    """
    build_attr = {} if attr is None else attr
    build_attr['help_tiling'] = level
    lowered = tvm.get_global_func('composite_lower')(kernel_desc, build_attr)

    def _as_list(table):
        # Convert a returned table into plain nested Python lists.
        return table.asnumpy().tolist()

    spaces = {
        'index': _as_list(lowered.index_table),
        'l1_range': _as_list(lowered.l1_tile_range_table),
        'l0_range': _as_list(lowered.l0_tile_range_table),
        'l1_mod': _as_list(lowered.l1_tile_mod_table),
        'l0_mod': _as_list(lowered.l0_tile_mod_table),
    }
    if level >= 2:
        spaces['tuning_space'] = _as_list(lowered.tiling_candidate)
    return spaces
def _build_to_gpu_func(desc_s, desc_d, attr=None, poly=False):
    """
    Build a GPU kernel from a json compute description.

    Build attributes that the caller left unset are filled in from the
    ``repository_gpu`` table before the description is passed to the
    "composite_with_json" global function.

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attr : dict of build attributes; {'dim': ''} is used when None.
            A caller-supplied dict is updated in place.
        poly : forwarded unchanged as the third argument of
            "composite_with_json"

    Returns:
        Module (the result of the "composite_with_json" global function).
    """
    def _lookup(path, fallback=None):
        # Walk the nested repository; a missing or falsy level yields fallback.
        node = repository_gpu
        for part in path:
            node = node.get(part)
            if not node:
                return fallback
        return node

    if attr is None:
        attr = {'dim': ''}
    compute, shape, dtype = generate_trait(desc_d)

    # Shape/dtype specific attrs win; otherwise use the per-compute entry.
    stored_attrs = _lookup([compute, shape, dtype, 'metadata', 'attrs'], {})
    if not stored_attrs:
        stored_attrs = _lookup([compute, 'metadata', 'attrs'], {})
    for name, stored in stored_attrs.items():
        if not attr.get(name):
            attr[name] = stored

    for name in ('dim', 'bind_block', 'bind_thread'):
        if attr.get(name) not in (None, ''):
            continue
        stored = _lookup([compute, shape, dtype, name])
        if stored:
            attr[name] = stored

    return tvm.get_global_func("composite_with_json")(desc_s, attr, poly)
def execute(self):
    """
    Parse the HWTS log and accumulate the cycles of the labelled kernel.

    The normalized source file is read in 64-byte records. For records whose
    "<stream_id>_<task_id>" equals the label returned by the
    "ascend_get_kernel_label" global function, a "Start of task" record
    stores its syscnt and an "End of task" record adds the difference
    between its syscnt and the stored one to the cycle total. When
    ``self._is_print`` is set, every valid record is also formatted and the
    result is written to ``self._output_filename``.

    Returns:
        The accumulated cycle count, or ``max_time_consume`` when that count
        is 0.
    """
    log_type = ['Start of task', 'End of task', 'Start of block', 'End of block', 'Block PMU']
    # Rows are collected and joined once instead of growing a string per record.
    rows = []
    self._source_flie_name = validate_and_normalize_path(self._source_flie_name)
    last_syscnt = 0
    cycles = 0
    kernel_label = tvm.get_global_func("ascend_get_kernel_label")()

    with open(self._source_flie_name, 'rb') as hwts_data:
        while True:
            # read one 64-byte record
            line = hwts_data.read(64)
            if not line:
                break
            if not line.strip():
                # record consists only of ASCII whitespace bytes
                continue

            byte_first_four = struct.unpack('BBHHH', line[0:8])
            # byte_first[0:4] refers to count. byte_first[4] refers to is_warn_res0_ov.
            # byte_first[5:8] refers to the type of ms.
            byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)
            ms_type = byte_first[-3:]
            is_warn_res0_ov = byte_first[4]
            cnt = int(byte_first[0:4], 2)
            core_id = byte_first_four[1]
            blk_id, task_id = byte_first_four[3], byte_first_four[4]
            stream_id, syscnt = self._parse_struct(ms_type, line, is_warn_res0_ov)
            if stream_id is None:
                logging.info("Profiling: invalid hwts log record type %s", ms_type)
                continue

            if int(task_id) < 25000:
                task_id = str(task_id)
            if kernel_label == (str(stream_id) + '_' + str(task_id)):
                record_kind = log_type[int(ms_type, 2)]
                if record_kind == "Start of task":
                    last_syscnt = syscnt
                elif record_kind == "End of task":
                    cycles += syscnt - last_syscnt

            if self._is_print:
                rows.append("%-14s %-4s %-8s %-9s %-8s %-15s %s\n" % (
                    log_type[int(ms_type, 2)], cnt, core_id, blk_id, task_id, syscnt, stream_id))

    if self._is_print:
        result_data = "".join(rows)
        fwrite_format(self._output_filename, data_source=self._dst_file_title, is_start=True)
        fwrite_format(self._output_filename, data_source=self._dst_file_column_title)
        fwrite_format(self._output_filename, data_source=result_data)

    return cycles if cycles != 0 else max_time_consume
def _build_to_gpu_func(desc_s, desc_d, attrs=None, poly=False):
    """
    Build a GPU kernel from a json compute description.

    The tiling repository is read from the file named by the
    MS_GRAPH_KERNEL_TILING environment variable when it is set; otherwise it
    is empty for descriptions containing 'buffer_stitch' and read from
    "repository_gpu.json" for everything else. Attributes the caller left
    unset are then filled in from that repository.

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attrs : dict of build attributes; {'dim': ''} is used when None.
            A caller-supplied dict is updated in place.
        poly : forwarded unchanged to the underlying build function

    Returns:
        Module. Descriptions containing 'parallel_fusion' or 'buffer_stitch'
        are built through _build_json_list_func with target 'cuda'; all
        others through the "composite_with_json" global function.
    """
    tiling_file = os.getenv('MS_GRAPH_KERNEL_TILING')
    if tiling_file:
        repository_gpu = read_repo_file(str(tiling_file))
    elif 'buffer_stitch' in desc_d:
        repository_gpu = {}
    else:
        repository_gpu = read_repo_file(
            _get_repository_file_path("repository_gpu.json"))

    def _lookup(path, fallback=None):
        # Walk the nested repository; a missing or falsy level yields fallback.
        node = repository_gpu
        for part in path:
            node = node.get(part)
            if not node:
                return fallback
        return node

    if attrs is None:
        attrs = {'dim': ''}
    compute, shape, dtype = generate_trait(desc_d)
    batchmatmul = _is_batchmatmul(desc_d)
    if batchmatmul:
        # Batch matmul entries are stored under a shape-independent key.
        shape = "any_shape"

    stored_attrs = _lookup([compute, shape, dtype, 'metadata', 'attrs'], {})
    if batchmatmul and stored_attrs:
        stored_attrs = _set_tiling_attrs(
            desc_d['output_desc'][0]['shape'], stored_attrs)
    if not stored_attrs:
        stored_attrs = _lookup([compute, 'metadata', 'attrs'], {})
    for name in stored_attrs:
        if not attrs.get(name):
            attrs[name] = stored_attrs[name]

    for name in ('dim', 'bind_block', 'bind_thread'):
        if attrs.get(name) not in (None, ''):
            continue
        stored = _lookup([compute, shape, dtype, name])
        if stored:
            attrs[name] = stored

    if 'parallel_fusion' in desc_d or 'buffer_stitch' in desc_d:
        return _build_json_list_func(desc_d, attrs, poly, 'cuda')
    return tvm.get_global_func("composite_with_json")(desc_s, attrs, poly)
def _get_feature(target, segment_tree, segment_infos):
    """
    Extract the feature of the statement produced by "tune_composite".

    Args:
        target : forwarded to "tune_composite" and to the feature extractor
        segment_tree : forwarded to "tune_composite"
        segment_infos : forwarded to "tune_composite"

    Returns:
        The first element returned by get_features_from_stmts for the single
        lowered statement.
    """
    lower = tvm.get_global_func("tune_composite")
    stmt, args = lower(target, True, segment_tree, segment_infos)

    # Imports are kept local to the function, as in the original code.
    from akg.tvm import build_module
    binds, _ = build_module.get_binds(args)

    from akg.utils.auto_tuning import get_features_from_stmts
    features = get_features_from_stmts(
        target=target, stmts=[stmt], binds=[binds], n_skip_cache=0)
    return features[0]
def _build_to_func(desc_s, desc_d, attr=None, use_repo=True):
    """
    Build a kernel function from a json compute description.

    The tiling repository is read from the file named by the
    MS_GRAPH_KERNEL_TILING environment variable when it is set, otherwise
    from "repository.json".

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attr : dict of build attributes; {'dim': ''} is used when None.
            A caller-supplied dict is updated in place.
        use_repo : when true, attributes the caller left unset are filled in
            from the repository

    Returns:
        Module. Descriptions containing 'parallel_fusion' or 'buffer_stitch'
        are built through _build_json_list_func with target 'cce'; all
        others through the "composite_with_json_to_func" global function.
    """
    tiling_file = os.getenv('MS_GRAPH_KERNEL_TILING')
    if tiling_file:
        repository = read_repo_file(str(tiling_file))
    else:
        repository = read_repo_file(_get_repository_file_path("repository.json"))

    def _lookup(path, fallback=None):
        # Walk the nested repository; a missing or falsy level yields fallback.
        node = repository
        for part in path:
            node = node.get(part)
            if not node:
                return fallback
        return node

    if attr is None:
        attr = {'dim': ''}
    # turn 'enable_auto_inline' off for composite op by default.
    attr.setdefault('enable_auto_inline', False)

    if use_repo:
        compute, shape, dtype = generate_trait(desc_d)
        stored_attrs = _lookup([compute, shape, dtype, 'metadata', 'attrs'], {})
        if not stored_attrs:
            stored_attrs = _lookup([compute, 'metadata', 'attrs'], {})
        for name, stored in stored_attrs.items():
            if not attr.get(name):
                attr[name] = stored
        if attr.get('dim') in (None, ''):
            tiling = _lookup([compute, shape, dtype, 'dim'])
            if tiling:
                attr['dim'] = tiling

    if 'parallel_fusion' in desc_d or 'buffer_stitch' in desc_d:
        return _build_json_list_func(desc_d, attr, True, 'cce')
    return tvm.get_global_func("composite_with_json_to_func")(desc_s, attr)
def get_tiling_space(kernel_desc, level=1, attr=None):
    """
    Query the tiling space of a composite kernel through "tune_composite".

    Args:
        kernel_desc : str of compute description (json); its 'process' field
            selects the backend specific attribute update
        level : info level; stored in the build attributes as 'help_tiling'
        attr : dict of build attributes; an empty dict is used when None.
            A caller-supplied dict is updated in place.

    Returns:
        dict. When the attributes carry a truthy 'use_new_space' it holds the
        raw result under 'tune_space'. Otherwise it holds 'index',
        'c1_range', 'c0_range', 'c1_mod' and 'c0_mod' (plus 'tuning_space'
        when level >= 2) as nested lists.
    """
    if attr is None:
        attr = {}
    attr['help_tiling'] = level
    attr['tuning'] = 'on'

    desc_d = json.loads(kernel_desc)
    backend = desc_d['process']
    all_ops = {op['name'] for op in desc_d['op_desc']}
    if backend in ("cuda", "cpu"):
        updater = _update_attrs_gpu if backend == "cuda" else _update_attrs_cpu
        attr = updater(all_ops, attr, True)
    else:
        attr = _update_attrs_ascend(all_ops, attr)

    segment_tree, segment_infos = get_tune_construct_args(kernel_desc, attr)
    ret = tvm.get_global_func("tune_composite")(
        backend, True, segment_tree, segment_infos)

    if attr.get("use_new_space", False):
        return {'tune_space': ret}

    # NOTE(review): the flattened source does not show whether the level
    # check belongs to this branch only; it is kept here - confirm.
    spaces = {}
    table_fields = (
        ('index', 'index_table'),
        ('c1_range', 'c1_tile_range_table'),
        ('c0_range', 'c0_tile_range_table'),
        ('c1_mod', 'c1_tile_mod_table'),
        ('c0_mod', 'c0_tile_mod_table'),
    )
    for key, field in table_fields:
        spaces[key] = getattr(ret, field).asnumpy().tolist()
    if level >= 2:
        spaces['tuning_space'] = ret.tiling_candidate.asnumpy().tolist()
    return spaces
def _build_to_func(desc_s, desc_d, attr=None):
    """
    Build a kernel function from a json compute description.

    Attributes the caller left unset are filled in from the ``repository``
    table before the description is passed to the
    "composite_with_json_to_func" global function.

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attr : dict of build attributes; {'dim': ''} is used when None.
            A caller-supplied dict is updated in place.

    Returns:
        Module (the result of the "composite_with_json_to_func" global
        function).
    """
    def _lookup(path, fallback=None):
        # Walk the nested repository; a missing or falsy level yields fallback.
        node = repository
        for part in path:
            node = node.get(part)
            if not node:
                return fallback
        return node

    if attr is None:
        attr = {'dim': ''}
    # turn 'enable_auto_inline' off for composite op by default.
    attr.setdefault('enable_auto_inline', False)

    compute, shape, dtype = generate_trait(desc_d)
    stored_attrs = _lookup([compute, shape, dtype, 'metadata', 'attrs'], {})
    if not stored_attrs:
        stored_attrs = _lookup([compute, 'metadata', 'attrs'], {})
    for name, stored in stored_attrs.items():
        if not attr.get(name):
            attr[name] = stored

    if attr.get('dim') in (None, ''):
        tiling = _lookup([compute, shape, dtype, 'dim'])
        if tiling:
            attr['dim'] = tiling

    return tvm.get_global_func("composite_with_json_to_func")(desc_s, attr)
def _build(desc_s, desc_d, attr=None):
    """
    Build a kernel, choosing the path from the description's 'process' field.

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attr : dict of build attributes, forwarded unchanged

    Returns:
        For 'cuda' the result of the "composite_with_json" global function;
        otherwise the result of _BuildToModule applied to _build_to_func.
    """
    if desc_d['process'] != 'cuda':
        lowered = _build_to_func(desc_s, desc_d, attr)
        return _api_internal._BuildToModule(lowered)
    return tvm.get_global_func("composite_with_json")(desc_s, attr)
def _build_json_list_func(desc_d, attrs, poly, target):
    """
    Split a composite description and build it with "composite_with_json_list".

    Args:
        desc_d : dict of compute description, split by _json_need_split
        attrs : dict of build attributes, passed to _json_need_split
        poly : forwarded unchanged to "composite_with_json_list"
        target : forwarded unchanged to "composite_with_json_list"

    Returns:
        The result of the "composite_with_json_list" global function.
    """
    func = tvm.get_global_func("composite_with_json_list")
    split_parts = _json_need_split(desc_d, attrs)
    (block_jsons, input_tensor_name, output_tensor_name, attrs_list,
     alloc_map_list, reuse_map_list, clean_op_map_list) = split_parts
    # The builder takes the maps before the attribute list, unlike the
    # order in which _json_need_split returns them.
    return func(
        block_jsons,
        input_tensor_name,
        output_tensor_name,
        alloc_map_list,
        reuse_map_list,
        clean_op_map_list,
        attrs_list,
        poly,
        target,
    )
def _build_to_module_ascend(desc_s_in, desc_d_in, attr=None, use_repo=True):
    """
    build kernel with compute description in json format

    Args:
        desc_s_in : str of compute description
        desc_d_in : dict of compute description
        attr : dict of build attributes; may be None
        use_repo : when true, attributes of a segment that has no given
            attributes are completed from "repository.json"

    Returns:
        Module. When attr contains "ret_mode" the result of
        _build_for_tuning is returned instead of the direct result of the
        "lower_composite_to_module" global function.
    """
    repository = _get_repository("repository.json", desc_d_in)

    def _update_attr_by_repo(desc_s, desc_d, attr, given_attrs=None, support_online_tuning=True):
        # Complete the attributes of one segment; returns (json str, attrs).
        def _auto_set_single_block(desc_d, attr):
            # Disable multicore when the description asks for single block mode.
            if not attr.get("enable_multicore", None) and desc_d.get("extra", None):
                if desc_d["extra"].get("BlockMode", "") == "single_block":
                    attr["enable_multicore"] = 0
            return attr

        if attr is None:
            attr = {'dim': ''}
        all_ops = set(op['name'] for op in desc_d['op_desc'])
        attr = _update_attrs_ascend(all_ops, attr)
        attr = _auto_set_single_block(desc_d, attr)
        if given_attrs is not None:
            # Given attributes only fill in what is still unset.
            for key, value in given_attrs.items():
                if not attr.get(key):
                    attr[key] = value
        elif use_repo:
            compute, shape, dtype = generate_trait(desc_d)
            repo_attr = _get_repo_attr(desc_d, compute, shape, dtype, repository, False)
            attr = merge_attrs(attr, repo_attr)
            # NOTE(review): the nesting of the online tuning branch and of the
            # _set_compute_attrs call was reconstructed from flattened source
            # text - confirm against the upstream file.
            if attr.get('dim') in (None, ''):
                tiling = get_attr_from_dict([compute, shape, dtype, 'dim'], repository)
                if tiling:
                    attr['dim'] = tiling
                elif support_online_tuning and 'online_tuning' in attr:
                    attr = _get_online_tune_attr(
                        desc_s, attr, get_repository_file_path("repository.json"))
            _, desc_s = _set_compute_attrs(desc_d, attr)
        return desc_s, attr

    def _get_parallel_repo(desc_d):
        compute, shape, dtype = generate_trait(desc_d)
        repo_attr = get_attr_from_dict([compute, shape, dtype, 'BlockPlan'], repository, {})
        return repo_attr

    def _get_stitch_repo(desc_d):
        compute, shape, dtype = generate_trait(desc_d)
        repo_attr = get_attr_from_dict([compute, shape, dtype], repository, {})
        return repo_attr

    def _parallel_postprocess(desc_d, json_str_list, attrs_list, _):
        parallel_repo = _get_parallel_repo(desc_d)
        if parallel_repo:
            # "BlockPlan" should be: [{"block_plan": x1, attr1: x2, attr2: x3}, ...]
            for i, [cur_json, cur_attr, cur_plan] in enumerate(
                    zip(json_str_list, attrs_list, parallel_repo)):
                # When BlockPlan is active, the body should be run as single block
                cur_attr["enable_multicore"] = 0
                json_str_list[i], attrs_list[i] = _update_attr_by_repo(
                    cur_json, json.loads(cur_json), cur_attr, cur_plan[ConstructKey.ATTRS], False)
        else:
            for i, [cur_json, cur_attr] in enumerate(zip(json_str_list, attrs_list)):
                json_str_list[i], attrs_list[i] = _update_attr_by_repo(
                    cur_json, json.loads(cur_json), cur_attr, None, False)
        return json_str_list, attrs_list

    def _stitch_postprocess(desc_d, stitch_jsons, attrs_list, _):
        def _stitch_combine_attrs(common_attr, sub_attrs):
            # Each result carries a copy of the common attrs; a non-empty
            # sub attr dict is nested under "sub_attr_<1-based index>".
            combine_attrs = []
            for i, sub_attr in enumerate(sub_attrs):
                new_sub_attrs = dict(common_attr)
                if sub_attr:
                    new_sub_attrs["sub_attr_" + str(i + 1)] = dict(sub_attr)
                combine_attrs.append(new_sub_attrs)
            return combine_attrs

        origin_stitch_attrs = attrs_list[0]
        if origin_stitch_attrs.get("peeling") is None:
            # Read buffer stitch attr from repo
            stitch_repo = _get_stitch_repo(desc_d)
            if stitch_repo.get("peeling") is not None:
                origin_stitch_attrs.update(stitch_repo)
            elif attr is not None and "online_tuning" in attr:
                # If buffer stitch attr not in repo, use online tuning.
                # The outer attr may be None, hence the guard.
                tuning_attr = _get_online_tune_attr(
                    json.dumps(desc_d), origin_stitch_attrs,
                    get_repository_file_path("repository.json"))
                origin_stitch_attrs.update(tuning_attr)
        # Update sub json attr
        common_attr, stitch_sub_attrs = split_stitch_attr(
            origin_stitch_attrs, len(stitch_jsons))
        for i, cur_json_str in enumerate(stitch_jsons):
            stitch_jsons[i], stitch_sub_attrs[i] = _update_attr_by_repo(
                cur_json_str, json.loads(cur_json_str), stitch_sub_attrs[i], {})
        stitch_attrs = _stitch_combine_attrs(common_attr, stitch_sub_attrs)
        return stitch_jsons, stitch_attrs

    def _normal_postprocess(desc_d, json_str_list, attrs_list, poly):
        _ = (desc_d, poly)  # For unused warning...
        for i, (cur_json_str, cur_attr) in enumerate(zip(json_str_list, attrs_list)):
            json_str_list[i], attrs_list[i] = _update_attr_by_repo(
                cur_json_str, json.loads(cur_json_str), cur_attr)
        return json_str_list, attrs_list

    post_funcs = {
        ConstructType.PARALLEL: _parallel_postprocess,
        ConstructType.STITCH: _stitch_postprocess,
        ConstructType.NORMAL: _normal_postprocess,
    }
    segment_tree, segment_infos = get_construct_args(desc_s_in, attr, post_funcs)
    process = desc_d_in["process"]
    func = tvm.get_global_func("lower_composite_to_module")
    # attr defaults to None, so test for it before the membership check.
    if attr is not None and "ret_mode" in attr:
        return _build_for_tuning(attr, func, process, segment_tree, segment_infos)
    return func(process, True, segment_tree, segment_infos)
def _build_to_module(desc_s, desc_d, attrs=None, poly=True):
    """
    build kernel with compute description in json format

    Args:
        desc_s : str of compute description
        desc_d : dict of compute description
        attrs : dict of build attributes; may be None
        poly : forwarded to the backend attribute update and to the
            "lower_composite_to_module" global function

    Returns:
        Module. When attrs contains "ret_mode" and poly is true the result of
        _build_for_tuning is returned instead of the direct result of
        "lower_composite_to_module".
    """

    def _update_attr_by_repo(desc_s, attrs):
        # Complete the attrs of one segment from "repository_<process>.json";
        # returns (parsed description, attrs).
        desc_d = json.loads(desc_s)
        process = desc_d["process"]
        file_name = "repository_" + process + ".json"
        repository = _get_repository(file_name, desc_d)
        all_ops = set(op["name"] for op in desc_d["op_desc"])

        if attrs is None:
            attrs = {"dim": ""}
        compute, shape, dtype = generate_trait(desc_d)
        batchmatmul = "BatchMatMul" in all_ops
        if batchmatmul:
            # BatchMatMul entries are looked up under a shape-independent key.
            shape = "any_shape"
        repo_attr = _get_repo_attr(desc_d, compute, shape, dtype, repository, batchmatmul)
        attrs = merge_attrs(attrs, repo_attr)
        attr_list = ["dim", "bind_block", "bind_thread"] if process == "cuda" else ["dim"]
        for item in attr_list:
            if attrs.get(item) in (None, ""):
                value = get_attr_from_dict([compute, shape, dtype, item], repository)
                if value:
                    attrs[item] = value
        # Online tuning is used only when 'dim' is still unset.
        if attrs.get("dim") in (None, "") and "online_tuning" in attrs:
            attrs = _get_online_tune_attr(desc_s, attrs, get_repository_file_path(file_name))
        return desc_d, attrs

    def _post_update_attr(desc_s, attrs, poly):
        # Repository completion followed by the backend specific update.
        desc_d, attrs = _update_attr_by_repo(desc_s, attrs)
        all_ops = set(op["name"] for op in desc_d["op_desc"])
        if desc_d["process"] == "cuda":
            attrs = _update_attrs_gpu(all_ops, attrs, poly)
        elif desc_d["process"] == "cpu":
            attrs = _update_attrs_cpu(all_ops, attrs, poly)
        return attrs

    def _common_postprocess(_, json_str_list, attrs_list, poly):
        for i, (cur_json_str, cur_attr) in enumerate(zip(json_str_list, attrs_list)):
            attrs_list[i] = _post_update_attr(cur_json_str, cur_attr, poly)
        return json_str_list, attrs_list

    def _stitch_postprocess(desc_d, json_str_list, attrs_list, poly):
        # Every stitch attr is completed against the whole description, so
        # the description is serialized once.
        whole_desc_s = json.dumps(desc_d)
        for i, cur_attr in enumerate(attrs_list):
            attrs_list[i] = _post_update_attr(whole_desc_s, cur_attr, poly)
        return json_str_list, attrs_list

    post_funcs = {
        ConstructType.PARALLEL: _common_postprocess,
        ConstructType.STITCH: _stitch_postprocess,
        ConstructType.NORMAL: _common_postprocess,
        ConstructType.TOT: _common_postprocess,
        ConstructType.CONCAT: _common_postprocess,
    }
    segment_tree, segment_infos = get_construct_args(desc_s, attrs, post_funcs)
    process = desc_d["process"]
    func = tvm.get_global_func("lower_composite_to_module")
    # attrs defaults to None, so test for it before the membership check.
    if attrs is not None and "ret_mode" in attrs and poly:
        return _build_for_tuning(attrs, func, process, segment_tree, segment_infos)
    return func(process, poly, segment_tree, segment_infos)