def parse_one_log(best_config_log, new_log_dir):
    target_wkl = None
    for inp, res in load_from_file(best_config_log):
        # Update the target string to generate the right SHA2 hash code.
        if target_wkl is None:
            inp.task.target = inp.target
            target_wkl = Workload.from_task(inp.task)
            target_wkl['target'] = log_target
            if target_wkl not in wkls:
                new_log_file_name = gen_log_file_name_from_workload(target_wkl)
                new_log_path = '{0}/{1}'.format(new_log_dir, new_log_file_name)
                wkls[target_wkl] = (new_log_path, [])
        if res.error_no != 0:
            continue

        # Only focus on the best N configs. The list is a min-heap keyed on the
        # negated mean cost, so heap[0] is always the worst config kept so far.
        new_inp = MeasureInput(target=log_target, task=inp.task, config=inp.config)
        if len(wkls[target_wkl][1]) < top_n_cfgs:
            heapq.heappush(wkls[target_wkl][1], (-np.mean(res.costs), new_inp))
        elif np.mean(res.costs) < -wkls[target_wkl][1][0][0]:
            heapq.heappop(wkls[target_wkl][1])
            heapq.heappush(wkls[target_wkl][1], (-np.mean(res.costs), new_inp))
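
# A minimal driver sketch for parse_one_log. The module-level names it relies
# on (wkls, top_n_cfgs, log_target) are defined here with hypothetical values;
# Workload and gen_log_file_name_from_workload are assumed to come from the
# surrounding project (Lorien), and the input/output paths are assumptions.
import glob
import heapq

import numpy as np
from tvm.autotvm.measure import MeasureInput
from tvm.autotvm.record import load_from_file

top_n_cfgs = 32       # Hypothetical budget of configs to keep per workload.
log_target = 'llvm'   # Hypothetical target string used for hashing.
wkls = {}             # Workload -> (new log path, min-heap of (-cost, input)).

for log in glob.glob('best_configs/*.json'):  # Hypothetical input logs.
    parse_one_log(log, 'regrouped_logs')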
def batch_loader(log_file, target, batch_size=8):
    """Batch loading measure inputs."""
    tvm_target = tvm.target.create(target)
    batch = []
    for inp, _ in load_from_file(log_file):
        # FIXME (comaniac): If we apply a different target (e.g., llvm to cuda),
        # then the task might be missing.
        inp.task.target = tvm_target
        new_inp = MeasureInput(tvm_target, inp.task, inp.config)
        batch.append(new_inp)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # Avoid yielding an empty trailing batch.
        yield batch
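
# A usage sketch for batch_loader; the log file name and target are
# hypothetical, and the per-batch processing is a placeholder.
for batch in batch_loader('tune.json', 'llvm', batch_size=16):
    print('loaded %d measure inputs' % len(batch))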
def _fetch_cfg(self):
    """Read and pre-process input schedules."""
    if isinstance(self._records, str):
        records = load_from_file(self._records)
    else:
        records = self._records
    cfg_dict = {}
    for record in records:
        in_measure, _ = record
        workload = in_measure.task.workload
        if workload not in cfg_dict:
            cfg_dict[workload] = []
        cfg_dict[workload].append(record)

    cache_dict = {}
    for key in self._in_nodes_dict:
        node_entry = self._node_list[key]
        if node_entry["op"] not in self._target_ops:
            continue
        workload = node_entry["workloads"][0]
        if workload in cache_dict:
            node_entry["record_candidates"] = cache_dict[workload]
            continue
        record_candidates = []
        infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]]
        layout_tracking_dict = {}
        for record in cfg_dict[workload]:
            in_measure, out_measure = record
            workload = in_measure.task.workload
            cfg = in_measure.config
            # For multiple cfgs which produce the same in/out layouts,
            # only the most efficient one is preserved.
            with self._target:
                layouts = infer_layout_func(workload, cfg)
            if layouts in layout_tracking_dict:
                cost = out_measure.costs[0]
                current_best_cost = layout_tracking_dict[layouts][1].costs[0]
                if cost < current_best_cost:
                    layout_tracking_dict[layouts] = record
            else:
                layout_tracking_dict[layouts] = record
        sorted_records = sorted(layout_tracking_dict.values(),
                                key=lambda item: item[1].costs[0])
        for i in range(min(self._max_sch_num, len(sorted_records))):
            record_candidates.append(sorted_records[i])
        node_entry["record_candidates"] = record_candidates
        cache_dict[workload] = record_candidates
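
# A standalone illustration of the dedup rule above, using toy
# (layouts, cost) pairs instead of real records: among entries that map to
# the same in/out layouts, only the cheapest one survives.
records = [(("NCHW", "NCHW16c"), 0.9),
           (("NCHW", "NCHW16c"), 0.4),
           (("NCHW", "NCHW8c"), 0.7)]
best = {}
for layouts, cost in records:
    if layouts not in best or cost < best[layouts]:
        best[layouts] = cost
print(sorted(best.items(), key=lambda kv: kv[1]))
# -> [(('NCHW', 'NCHW16c'), 0.4), (('NCHW', 'NCHW8c'), 0.7)]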
def extract_feature_from_file(log_file: str, out_path: str):
    """Parse a log file and extract features to the output file."""
    data: Dict[Tuple[str, str, str], List[str]] = {}
    cnt = 0
    for inp, res in load_from_file(log_file):
        cnt += 1
        key = (gen_key_str(inp), gen_target_str(inp), gen_file_str(inp))
        if key not in data:
            data[key] = []

        try:
            features = extract_feature(inp)
        except Exception as err:  # pylint: disable=broad-except
            return str(err)

        # Compute GFLOP/s
        task = create(inp.task.name, inp.task.args, inp.target)
        if res.error_no == 0:
            features['thrpt'] = np.around(task.flop / 1e9 / np.mean(res.costs), 2).tolist()
        else:
            features['thrpt'] = 0

        data[key].append(json.dumps(features))

    for (_, target_key, file_key), feats in data.items():
        if not os.path.exists(os.path.join(out_path, target_key)):
            os.mkdir(os.path.join(out_path, target_key))
        out_file = '{0}/{1}/{2}.json'.format(out_path, target_key, file_key)
        lock_file = '{0}.lock'.format(out_file)
        with FileLock(lock_file):
            with open(out_file, 'a') as filep:
                for record in feats:
                    filep.write(record)
                    filep.write('\n')
    return None

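# A driver sketch for extract_feature_from_file; the log directory and output
# path are hypothetical. The function returns an error string on failure and
# None on success, and out_path must exist before the per-target mkdir runs.
import glob
import os

os.makedirs('features', exist_ok=True)
for log in glob.glob('tuning_logs/*.json'):
    err = extract_feature_from_file(log, 'features')
    if err is not None:
        print('%s failed: %s' % (log, err))
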
def benchmark_layout_transform(self,
                               min_exec_num=100,
                               timeout=10,
                               use_rpc=False,
                               device_key=None,
                               host="localhost",
                               port=9190,
                               n_parallel=1,
                               build_func='default',
                               layout_records=None,
                               target_host=None,
                               infer_layout=False):
    """Benchmark all possible layout transformations in the graph, given a set
    of schedule candidates for each workload of the target operator.

    Parameters
    ----------
    min_exec_num : int, optional
        Minimum number of executions. The final execution time is the average
        over all executions.

    timeout : int, optional
        Timeout for each execution.

    use_rpc : boolean, optional
        Whether to use RPC mode for benchmarking.

    device_key : str, optional
        Remote device key which can be queried by
        python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

    host : str, optional
        IP address used to create RPC tracker on host machine.

    port : int, optional
        Port number used to create RPC tracker on host machine.

    n_parallel : int, optional
        The number of measurement tasks that can run in parallel. Set this
        according to the number of cpu cores (for compilation) and the number
        of devices you have (for measuring generated code).

    build_func : str or callable, optional
        'default': call default builder. This works for normal targets
        (llvm, cuda).

        'ndk': use Android NDK to create a shared library. Use this for
        android targets.

        callable: customized build function for other backends (e.g. VTA).
        See autotvm/measure/measure_methods.py::default_build_func for example.

    layout_records : str or iterator of (MeasureInput, MeasureResult), optional
        Collection of layout_transform benchmarking records. If it is a str,
        it should be the filename of a records log file. Each row of this file
        is an encoded record pair. Otherwise, it is an iterator.

        If this argument is set, the graph tuner first checks whether a
        layout_transform workload already exists in the records and skips
        benchmarking if possible.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device. When TVM compiles
        device-specific programs such as CUDA, we also need host (CPU) side
        code to interact with the driver and set up the dimensions and
        parameters correctly. target_host is used to specify the host-side
        codegen target. By default, llvm is used if it is enabled; otherwise
        a stackvm interpreter is used.

    infer_layout : bool, optional
        Whether to infer layout transformation time if it doesn't exist in
        the records, instead of benchmarking on the target device. This might
        bring performance loss compared to benchmarking layout transformations.
    """
    self._logger.info("Start to benchmark layout transformation...")
    if layout_records is None and infer_layout:
        raise RuntimeError("Requires some records to infer layout transformation time.")

    if isinstance(layout_records, str):
        layout_records = load_from_file(layout_records)
        if not layout_records and infer_layout:
            raise RuntimeError("Records must be non-empty to infer layout transformation time.")

    num_flops, total_time = 0, 0
    if layout_records is not None:
        for record in layout_records:
            ltf_wkl = record[0].task.workload
            self._layout_transform_perf_records[ltf_wkl] = record
            input_shape = ltf_wkl[1][1]
            flops = np.prod(input_shape)
            num_flops += flops
            total_time += record[1].costs[0]
    avg_time = total_time / num_flops if num_flops > 0 else 0

    args_list = []

    def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
        """Callback function to fetch layout transform args"""
        _, in_layout, out_layout = args
        if in_layout != out_layout:
            args_list.append(args)

    self._iterate_layout_transform(_fetch_args_callback)

    def _log_to_list(record_list):
        """Callback to log result to a list."""
        def _callback(_, inputs, results):
            """Callback implementation"""
            record_list.append((inputs[0], results[0]))
        return _callback

    builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
    runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
    if use_rpc:
        if device_key is None:
            raise RuntimeError("device_key needs to be set to use RPC tracker mode.")
        runner = autotvm.measure.RPCRunner(device_key, host, port,
                                           n_parallel=n_parallel,
                                           number=min_exec_num,
                                           repeat=1,
                                           timeout=timeout)
    measure_option = autotvm.measure_option(builder=builder, runner=runner)
    for args in args_list:
        data, in_layout, out_layout = args
        args = serialize_args(args)
        ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(args)
        if ltf_workload in self._layout_transform_perf_records:
            continue

        if infer_layout:
            input_shape = ltf_workload[1][1]
            flops = 1
            for i in input_shape:
                flops *= i

            # Rule out invalid layout transformations
            out = topi.layout_transform(data, in_layout, out_layout)
            out_flops = 1
            for i in topi.util.get_const_tuple(out.shape):
                out_flops *= i

            if flops != out_flops:
                inferred_time = INVALID_LAYOUT_TIME
            else:
                inferred_time = flops * avg_time

            record_input = MeasureInput(target=self._target, task=None, config=None)
            record_output = MeasureResult(costs=(inferred_time,), error_no=0,
                                          all_cost=-1, timestamp=-1)
            self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
            continue

        records = []
        task = autotvm.task.create(layout_transform, args=args,
                                   target=self._target, target_host=target_host)
        task.workload = ltf_workload
        tuner = autotvm.tuner.GridSearchTuner(task)
        tuner.tune(n_trial=1, measure_option=measure_option,
                   callbacks=[_log_to_list(records)])
        if not isinstance(records[0][1].costs[0], float):
            records[0] = (records[0][0],
                          records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
        self._layout_transform_perf_records[ltf_workload] = records[0]

    self._iterate_layout_transform(self._create_matrix_callback)
    self._logger.info("Benchmarking layout transformation successful.")
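
# A minimal usage sketch of the graph tuner that defines the method above,
# following the public TVM API; the Relay module `mod`, the input shape, and
# the records file 'conv2d_tuned.json' are assumptions.
import tvm.relay as relay
from tvm.autotvm.graph_tuner import DPTuner

target_ops = [relay.op.get("nn.conv2d")]
tuner = DPTuner(mod["main"], {"data": (1, 3, 224, 224)},
                "conv2d_tuned.json", target_ops, "llvm")
tuner.benchmark_layout_transform(min_exec_num=100)
tuner.run()
tuner.write_opt_sch2record_file("graph_opt.log")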
def run_one_wkl(platform1_log, platform2_dirs):
    target_wkl = None
    cfgs = []
    log_files = None
    for inp, res in load_from_file(platform1_log):
        # Update the target string to generate the same SHA2 hash code
        # to identify the full config log file.
        if target_wkl is None:
            inp.task.target = inp.target
            target_wkl = Workload.from_task(inp.task)
            target_wkl['target'] = log_target

            # The full config log file name by Lorien is composed of
            # <SHA2>-<5-byte UUID4>.json
            log_files = []
            for platform2_dir in platform2_dirs:
                log_path = '{0}/{1}*.json'.format(platform2_dir, target_wkl.hash_sha2())
                log_files += glob.glob(log_path)
            if not log_files:
                print('Log missing for %s: %s' % (str(target_wkl), target_wkl.hash_sha2()))
                return

        # Only focus on the best N configs.
        target_cfg_str = str(inp.config)
        if len(cfgs) < top_n_cfgs:
            heapq.heappush(cfgs, (-np.mean(res.costs), target_cfg_str))
        elif np.mean(res.costs) < -cfgs[0][0]:
            heapq.heappop(cfgs)
            heapq.heappush(cfgs, (-np.mean(res.costs), target_cfg_str))

    # Load and sort all configs.
    assert log_files is not None
    all_records = {}
    for log_file in log_files:
        for inp, res in load_from_file(log_file):
            # De-duplication.
            cfg = str(inp.config)
            if cfg not in all_records:
                all_records[cfg] = (inp, res)
            else:
                old_res = all_records[cfg][1]
                all_records[cfg] = (inp, res if np.mean(res.costs) < np.mean(old_res.costs)
                                    else old_res)
    all_records = sorted(all_records.values(), key=lambda p: np.mean(p[1].costs))
    cfg_to_rank_on_p2 = {str(inp.config): rank for rank, (inp, _) in enumerate(all_records)}
    mapped = [False for _ in range(len(cfg_to_rank_on_p2) + 1)]

    assert target_wkl is not None
    #task = target_wkl.to_task()
    #space_size = np.product([len(v.entities) for v in task.config_space.space_map.values()])

    cfgs.sort(key=lambda x: -x[0])
    for rank1, (target_cost, target_cfg_str) in enumerate(cfgs):
        target_cost = -target_cost
        #display_name = '{} {} rank1 {}'.format(str(task), target_cfg_str, rank1)

        # Map the rank from the first platform to the second platform.
        if target_cfg_str not in cfg_to_rank_on_p2:
            continue
        rank2 = cfg_to_rank_on_p2[target_cfg_str]
        rank_shift = rank1 - rank2
        mapped[rank2] = True
        #print('{:40s}\trank2 {:5d}\t{:5d}\t{:10d}'.format(display_name,
        #                                                  rank2, len(all_records), space_size))
        #print('%d\t%d' % (rank1, rank2))
        if rank_shift not in hist:
            hist[rank_shift] = 0
        hist[rank_shift] += 1
        #if rank_shift <= -2000:
        #    print('{}\trank2 {:5d}\t{:5d}\t{:10d}'.format(display_name, rank2,
        #                                                  len(all_records), space_size))
    return

# The config log directory of the first platform.
platform1_dir = sys.argv[1]

# The config folder of the second platform.
# When the target config is missing in the full log,
# we may re-tune it and put it to another log file.
# In short, when two platform2 dirs are specified,
# we will aggregate their logs as the reference.
platform2_dirs = [sys.argv[2]]
if len(sys.argv) == 4:
    platform2_dirs.append(sys.argv[3])

# Identify the target string from the first record of the reference logs.
log_target = None
for log_file in glob.glob('{}/*'.format(platform2_dirs[0])):
    for inp, res in load_from_file(log_file):
        log_target = str(inp.target)
        break
    break
print(log_target)

# Histogram of rank shifting counts.
hist = {}
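
# A hypothetical driver for the script above; the original main loop is not
# shown, so the glob pattern and the final histogram dump are assumptions.
for platform1_log in glob.glob('{}/*.json'.format(platform1_dir)):
    run_one_wkl(platform1_log, platform2_dirs)

for shift in sorted(hist):
    print('%d\t%d' % (shift, hist[shift]))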
# Number of top configs.
top_n_cfgs = 5000

# The config found by the rank model to be analyzed.
best_config_log = sys.argv[1]

# The full config by AutoTVM to be referenced.
all_log_dir = sys.argv[2]

# The target string in the full config log file.
log_target = sys.argv[3]

wkl_to_log_file = {}
missed = 0
total = 0
for inp, res in load_from_file(best_config_log):
    # Update the target string to generate the same SHA2 hash code
    # to identify the full config log file.
    inp.task.target = inp.target
    target_wkl = Workload.from_task(inp.task)
    target_wkl['target'] = log_target

    target_cfg_str = str(inp.config)
    if target_wkl not in wkl_to_log_file:
        # The full config log file name by Lorien is composed of
        # <SHA2>-<5-byte UUID4>.json
        total += 1
        log_path = '{0}/{1}*.json'.format(all_log_dir, target_wkl.hash_sha2())
        log_files = glob.glob(log_path)
        if not log_files:
            print('Log missing for %s' % str(target_wkl))
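
# The script above is truncated here. Given the `missed`/`total` counters it
# declares, a hypothetical continuation (an assumption, not the original code)
# could be:
#     if not log_files:
#         print('Log missing for %s' % str(target_wkl))
#         missed += 1
#         continue
#     wkl_to_log_file[target_wkl] = log_files[0]
# ...followed by a summary such as:
#     print('Missed %d / %d workloads' % (missed, total))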