def test_file_io():
    """Write measurement records to a log file and read them back.

    Ten regular records are logged, followed by one extra record whose
    config has had its entity map emptied.  The results loaded from the
    file are then compared, in order, with the ten reference results.
    """
    tmp_dir = utils.tempdir()
    file_path = tmp_dir.relpath("temp.log")

    task, target = get_sample_task()
    inputs = [
        MeasureInput(target, task, task.config_space.get(idx))
        for idx in range(10)
    ]
    results = [MeasureResult((idx,), 0, 0, 0) for idx in range(10)]

    # Extra record with its entity map erased, to test if it will be
    # ignored when loading back.
    invalid_inp = MeasureInput(target, task, task.config_space.get(10))
    invalid_res = MeasureResult((10,), 0, 0, 0)
    invalid_inp.config._entity_map = {}

    with open(file_path, "w") as log_fo:
        write_log = autotvm.callback.log_to_file(log_fo)
        write_log(None, inputs, results)
        write_log(None, [invalid_inp], [invalid_res])

    reference = zip(inputs, results)
    loaded = autotvm.record.load_from_file(file_path)
    for expected_pair, loaded_pair in zip(reference, loaded):
        assert expected_pair[1] == loaded_pair[1]
def test_file_io():
    """Round-trip records through a log file and query it via ApplyHistoryBest.

    Ten regular records plus one record with an emptied entity map are
    logged.  Loaded results are compared in order with the reference
    results, and the same file is then passed twice to ``ApplyHistoryBest``
    to check that loading from several files works.
    """
    tmp_dir = utils.tempdir()
    file_path = tmp_dir.relpath("temp.log")

    task, target = get_sample_task()
    inputs = [
        MeasureInput(target, task, task.config_space.get(idx))
        for idx in range(10)
    ]
    results = [MeasureResult((idx,), 0, 0, 0) for idx in range(10)]

    # Extra record with its entity map erased, to test if it will be
    # ignored when loading back.
    invalid_inp = MeasureInput(target, task, task.config_space.get(10))
    invalid_res = MeasureResult((10,), 0, 0, 0)
    invalid_inp.config._entity_map = {}

    with open(file_path, "w") as log_fo:
        write_log = autotvm.callback.log_to_file(log_fo)
        write_log(None, inputs, results)
        write_log(None, [invalid_inp], [invalid_res])

    reference = zip(inputs, results)
    loaded = autotvm.record.load_from_file(file_path)
    for expected_pair, loaded_pair in zip(reference, loaded):
        assert expected_pair[1] == loaded_pair[1]

    # Confirm functionality of multiple file loads.
    hist_best = ApplyHistoryBest([file_path, file_path])
    best_cfg = hist_best.query(target, task.workload)
    first_input = inputs[0]
    assert str(best_cfg) == str(first_input[2])
def test_apply_history_best():
    """Check that ApplyHistoryBest returns the config with the lowest cost."""
    task, target = get_sample_task()

    # (config index, measured cost) for every record fed to the dispatcher.
    samples = [(0, 0.1), (1, 0.3), (2, 0.01), (4, 0.4)]
    records = [
        (
            MeasureInput(target, task, task.config_space.get(cfg_idx)),
            MeasureResult((cost,), 0, 2.3, 0),
        )
        for cfg_idx, cost in samples
    ]

    hist_best = ApplyHistoryBest(records)
    best_cfg = hist_best.query(target, task.workload)
    # Config 2 carries the smallest cost (0.01) among the samples.
    assert str(best_cfg) == str(task.config_space.get(2))
def parse_one_log(best_config_log, new_log_dir):
    """Collect the best configs of one log file into ``wkls``.

    Reads ``log_target``, ``wkls`` and ``top_n_cfgs`` from the enclosing
    scope.  The workload key is derived from the first record of the file;
    every successful record is then offered to a bounded heap of at most
    ``top_n_cfgs`` entries stored under that key.

    Parameters
    ----------
    best_config_log : str
        Path of the log file to read with ``load_from_file``.
    new_log_dir : str
        Directory used to build the output log path for a new workload.
    """
    target_wkl = None
    for inp, res in load_from_file(best_config_log):
        if target_wkl is None:
            # Update the target string to generate the right SHA2 hash code.
            inp.task.target = inp.target
            target_wkl = Workload.from_task(inp.task)
            target_wkl['target'] = log_target
            if target_wkl not in wkls:
                file_name = gen_log_file_name_from_workload(target_wkl)
                log_path = '{0}/{1}'.format(new_log_dir, file_name)
                wkls[target_wkl] = (log_path, [])

        # Failed measurements never enter the heap.
        if res.error_no != 0:
            continue

        # Only focus on the best N configs.  Costs are stored negated so
        # that the heap root is the worst (largest-cost) entry kept so far.
        new_inp = MeasureInput(target=log_target,
                               task=inp.task,
                               config=inp.config)
        heap = wkls[target_wkl][1]
        mean_cost = np.mean(res.costs)
        if len(heap) < top_n_cfgs:
            heapq.heappush(heap, (-mean_cost, new_inp))
        elif mean_cost < -heap[0][0]:
            heapq.heappop(heap)
            heapq.heappush(heap, (-mean_cost, new_inp))
def test_PBQPTuner_run():
    """Run PBQPTuner on the sample graph and check the selected configs."""
    target = "llvm"
    dtype = "float32"
    layout = "NCHW"
    dshape = (1, 3, 8, 8)
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    g, records, ltf_records, ltf_keys, tasks = _create_data(
        target, dshape, dtype, layout)

    # Three extra candidate schedules, one per extracted task.
    costs = [0.02, 0.02, 0.045]
    entity_table = [
        [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict(
            {"index": -1, "code_hash": None, "entity": entity})
        for entity in entity_table
    ]

    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    executor = PBQPTuner(g, {"data": dshape}, records, target_ops, target)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()

    out = [pair[0].config for pair in executor.get_optimal_records()]
    expected_out = [records[idx][0].config for idx in (3, 1, 2)]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
        % (str(expected_out), str(out))
def test_DPTuner_run():
    """Run DPTuner on the sample graph; check chosen configs and the log file."""
    log_file = "%s/test_tuner.log" % (os.getcwd())
    target = "llvm"
    dtype = "float32"
    layout = "NCHW"
    dshape = (1, 3, 8, 8)
    target_ops = [relay.nn.conv2d]

    g, records, ltf_records, ltf_keys, tasks = _create_data(
        target, dshape, dtype, layout)
    mod = relay.module.Module()
    mod["main"] = g

    # Three extra candidate schedules, one per extracted task.
    costs = [0.02, 0.02, 0.045]
    entity_table = [
        [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict(
            {"i": -1, "c": None, "e": entity, "t": ""})
        for entity in entity_table
    ]

    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost,),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    executor = DPTuner(mod, {"data": dshape}, records, target_ops, target,
                       log_file=log_file)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True)
    executor.run()

    out = [pair[0].config for pair in executor.get_optimal_records()]
    expected_out = [records[idx][0].config for idx in (3, 1, 2)]
    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
        % (str(expected_out), str(out))
    assert os.path.isfile(log_file), "No log file with name %s exists." % log_file
def batch_loader(log_file, target, batch_size=8):
    """Batch loading measure inputs.

    Parameters
    ----------
    log_file : str
        Log file passed to ``load_from_file``.
    target : str
        Target string passed to ``tvm.target.create``; every loaded task and
        every yielded ``MeasureInput`` is re-targeted to it.
    batch_size : int, optional
        Number of inputs per yielded batch.

    Yields
    ------
    list of MeasureInput
        Batches of exactly ``batch_size`` inputs, followed by one shorter
        batch holding the remainder if there is any.  Empty batches are
        never yielded.
    """
    tvm_target = tvm.target.create(target)
    batch = []
    for inp, _ in load_from_file(log_file):
        # FIXME (comaniac): If we apply different target (e.g., llvm to cuda) then
        # the task might be missing.
        inp.task.target = tvm_target
        batch.append(MeasureInput(tvm_target, inp.task, inp.config))
        if len(batch) == batch_size:
            yield batch
            batch = []

    # Only flush a real remainder: when the record count is a multiple of
    # batch_size (or the log is empty) there is nothing left to emit.
    if batch:
        yield batch
def test_load_dump():
    """Encode and decode one record with each protocol and compare fields."""
    task, target = get_sample_task()
    inp = MeasureInput(target, task, task.config_space.get(0))
    result = MeasureResult((2.0, 2.23, 0.23, 0.123, 0.234, 0.123),
                           MeasureErrorNo.NO_ERROR, 2.3, time.time())

    for protocol in ['json', 'pickle']:
        row = encode(inp, result, protocol=protocol)
        inp_2, result_2 = decode(row, protocol=protocol)

        # The input is compared through its string key.
        key_before = measure_str_key(inp)
        key_after = measure_str_key(inp_2)
        assert key_before == key_after, "%s vs %s" % (key_before, key_after)

        # The result is compared field by field.
        assert result.costs == result_2.costs
        assert result.error_no == result_2.error_no
        assert result.timestamp == result_2.timestamp
def measure_configs(self, transitions, n_parallel, measure_batch, callbacks):
    """Measure results for current population.

    The transitions are measured in consecutive batches of at most
    ``n_parallel`` items, starting from ``transitions[0]``.  Each transition
    gets its ``input``, ``result`` and ``score`` attributes filled in, the
    score is appended to ``self.scores`` and the best-so-far bookkeeping
    (``best_flops``, ``best_config``, ``best_measure_pair``, ``best_iter``)
    is updated.  Once every batch has been measured, each callback is invoked
    with the inputs and results of all transitions.

    Parameters
    ----------
    transitions : sequence
        Objects exposing a ``gene`` attribute; indexed by position.
    n_parallel : int
        Maximum number of configs handed to ``measure_batch`` at once.
    measure_batch : callable
        Called with a list of ``MeasureInput``; must return a pair whose
        first element is the list of results.
    callbacks : iterable of callable
        Each is called as ``callback(self, inputs, results)``.
    """
    total = len(transitions)
    for batch_idx in range(ceil(total / n_parallel)):
        # Batches start at batch_idx * n_parallel.  (An earlier "- 1" here
        # made the first batch wrap around to the last transition and left
        # self.scores rotated relative to `transitions`.)
        start = batch_idx * n_parallel
        stop = min(start + n_parallel, total)
        batch = [transitions[j] for j in range(start, stop)]

        # Get configs
        configs = [
            self.space.get(knob2point(transition.gene, self.dims))
            for transition in batch
        ]

        # Measure batch
        inputs = [
            MeasureInput(self.task.target, self.task, config)
            for config in configs
        ]
        results, _ = measure_batch(inputs)

        # Unpack result
        for transition, measure_input, result in zip(batch, inputs, results):
            self.step_count += 1
            transition.input = measure_input
            transition.result = result
            if result.error_no == 0:
                transition.score = measure_input.task.flop / np.mean(
                    result.costs)
            else:
                # Failed measurements score zero.
                transition.score = 0.0
            self.scores.append(transition.score)

            # Update best
            if transition.score > self.best_flops:
                self.best_flops = transition.score
                self.best_config = transition.input.config
                self.best_measure_pair = (transition.input,
                                          transition.result)
                self.best_iter = self.step_count

    if callbacks:
        # Build the argument lists once; they are identical for every callback.
        all_inputs = [t.input for t in transitions]
        all_results = [t.result for t in transitions]
        for callback in callbacks:
            callback(self, all_inputs, all_results)
def test_file_io():
    """Write ten measurement records to a log file and read them back."""
    tmp_dir = util.tempdir()
    file_path = tmp_dir.relpath("temp.log")

    task, target = get_sample_task()
    inputs = [
        MeasureInput(target, task, task.config_space.get(idx))
        for idx in range(10)
    ]
    results = [MeasureResult((idx,), 0, 0, 0) for idx in range(10)]

    with open(file_path, "w") as log_fo:
        write_log = autotvm.callback.log_to_file(log_fo)
        write_log(None, inputs, results)

    # Loaded results must match the reference results in order.
    reference = zip(inputs, results)
    loaded = autotvm.record.load_from_file(file_path)
    for expected_pair, loaded_pair in zip(reference, loaded):
        assert expected_pair[1] == loaded_pair[1]
def tune_kernels(tasks,
                 gen_graph_tuner_candidates,
                 measure_top_n,
                 measure_option,
                 tuner='random',
                 early_stopping=None,
                 n_trial=5000,
                 log_filename='tuning.log'):
    """Tune kernels with the ranking model.

    Parameters
    ----------
    tasks : list
        Tasks to tune; each exposes ``name``, ``target`` and ``workload``.
    gen_graph_tuner_candidates : bool
        With the 'round' tuner, re-measure the top 20 configs instead of 1.
    measure_top_n : int
        Forwarded to ``RoundTuner`` as ``n_cfg``.
    measure_option : dict
        Its 'runner' entry must be a ``RankModelRunner``.
    tuner : str, optional
        One of 'round', 'xgb', 'xgb-rank', 'ga', 'random', 'gridsearch'.
    early_stopping : optional
        Forwarded to ``tuner_obj.tune``.
    n_trial : int, optional
        Forwarded to ``tuner_obj.tune`` and to the progress callbacks.
    log_filename : str, optional
        File that receives the logged records.

    Returns
    -------
    list of tuple
        ``(workload, best_idx, best_gflops)`` per task tuned with 'round';
        empty for every other tuner.

    Raises
    ------
    ValueError
        If ``tuner`` is not one of the supported names.
    """
    # Tasks with these names are never tuned here.
    skipped_tasks = [
        'dense_small_batch.cuda', 'conv2d_cudnn.cuda', 'dense_cublas.cuda',
        'dense_large_batch.cuda', 'conv2d_transpose_nchw.cuda',
        'dense_tensorcore.cuda'
    ]
    use_round = tuner == 'round'

    remeasure_option = None
    if use_round:
        # Setup another measure option for final remeasurement.
        remeasure_option = autotvm.measure_option(
            builder=LocalBuilder(),
            runner=measure_option['runner'].local_runner,
        )
    assert isinstance(measure_option['runner'], RankModelRunner)

    best_results = []
    n_tasks = len(tasks)
    for task_no, task in enumerate(tasks, start=1):
        prefix = "[Task %2d/%2d] " % (task_no, n_tasks)

        if task.name in skipped_tasks:
            continue
        if task.name not in measure_option['runner'].models:
            print('not covered by cost models')
            continue

        # create tuner
        if use_round:
            tuner_obj = RoundTuner(task, n_cfg=measure_top_n)
        elif tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_round:
            # The round tuner reports progress differently and logs later.
            callbacks = [rank_progress(n_trial, prefix=prefix)]
        else:
            callbacks = [
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename)
            ]

        tic = time.time()

        # do tuning
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=callbacks)

        if not use_round:
            continue

        # Round tuner needs an extra measurement step to get the real
        # throughputs.
        max_n_layout = 20 if gen_graph_tuner_candidates else 1
        top_cfgs = tuner_obj.get_top_rank_cfgs(max_n_layout)
        measure_batch = create_measure_batch(task, remeasure_option)
        inputs = [MeasureInput(task.target, task, cfg) for cfg in top_cfgs]
        sys.stderr.write('{} Measure Top {} Configs'.format(
            prefix, len(inputs)))
        results = measure_batch(inputs)

        # GFLOPS per re-measured config; failed measurements count as 0.
        gflops = []
        for idx, (inp, res) in enumerate(zip(inputs, results)):
            if res.error_no == 0:
                gflops.append((idx, inp.task.flop / np.mean(res.costs) / 1e9))
            else:
                gflops.append((idx, 0))
        best_idx, best_flops = max(gflops, key=lambda pair: pair[1])

        best_results.append((task.workload, best_idx, best_flops))
        sys.stderr.write(' | Best %.2f GFLOPS at Top %d | %.2fs\n' %
                         (best_flops, best_idx, time.time() - tic))
        autotvm.callback.log_to_file(log_filename)(None, inputs, results)

    return best_results
def benchmark_layout_transform(self,
                               min_exec_num=100,
                               timeout=10,
                               use_rpc=False,
                               device_key=None,
                               host="localhost",
                               port=9190,
                               n_parallel=1,
                               build_func='default',
                               layout_records=None,
                               target_host=None,
                               infer_layout=False):
    """Benchmark all possible layout transformation in the graph,
    given a set of schedule candidates for each workload of target operator.

    Parameters
    ----------
    min_exec_num : int, optional
        Minimum number of execution. Final execution time is the average of
        all execution time.

    timeout : int, optional
        Time out for each execution.

    use_rpc : boolean, optional
        Whether to use rpc mode for benchmarking.

    device_key : str, optional
        Remote device key which can be queried by
        python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

    host : str, optional
        IP address used to create RPC tracker on host machine.

    port : int, optional
        Port number used to create RPC tracker on host machine.

    n_parallel: int, optional
        The number of measurement task that can run in parallel.
        Set this according to the number of cpu cores (for compilation) and
        the number of devices you have (for measuring generate code).

    build_func: str or callable, optional
        'default': call default builder. This works for normal target (llvm, cuda)

        'ndk': use Android NDK to create shared library. Use this for android target.

        callable: customized build function for other backends (e.g. VTA).
                  See autotvm/measure/measure_methods.py::default_build_func for example.

    layout_records : str or iterator of (MeasureInput, MeasureResult). optional
        Collection of layout_transform benchmarking records.
        If is str, then it should be the filename of a records log file.
        Each row of this file is an encoded record pair.
        Otherwise, it is an iterator.

        If this argument is set, graph tuner will first check whether layout_transform
        workload already exists in records and skip benchmarking if possible.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host(CPU) side code to interact with the driver
        setup the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    infer_layout : bool, optional
        Whether to infer layout transformation time if it doesn't exist in records, instead
        of benchmarking on target device.

        This might bring performance loss comparing to benchmarking layout transformation.

    Raises
    ------
    RuntimeError
        If ``infer_layout`` is set but ``layout_records`` is None or empty,
        or if ``use_rpc`` is set without a ``device_key``.
    """
    self._logger.info("Start to benchmark layout transformation...")
    if layout_records is None and infer_layout:
        raise RuntimeError(
            "Requires some records to infer layout transformation time.")

    if isinstance(layout_records, str):
        layout_records = load_from_file(layout_records)
    if layout_records is not None:
        # Materialise the records: load_from_file returns a generator, which
        # is always truthy, so the emptiness check below would otherwise
        # never trigger for log files.
        layout_records = list(layout_records)
    if not layout_records and infer_layout:
        raise RuntimeError(
            "Records must be non-empty to infer layout transformation time."
        )

    # Register known records and accumulate the totals used to derive an
    # average time per element for inferred layout transformations.
    num_flops, total_time = 0, 0
    if layout_records is not None:
        for record in layout_records:
            ltf_wkl = record[0].task.workload
            self._layout_transform_perf_records[ltf_wkl] = record
            input_shape = ltf_wkl[1][1]
            flops = np.prod(input_shape)
            num_flops += flops
            total_time += record[1].costs[0]
    avg_time = total_time / num_flops if num_flops > 0 else 0

    args_list = []

    def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx,
                             to_sch_idx, args):
        """Callback function to fetch layout transform args"""
        _, in_layout, out_layout = args
        if in_layout != out_layout:
            args_list.append(args)

    self._iterate_layout_transform(_fetch_args_callback)

    def _log_to_list(record_list):
        """Callback to log result to a list."""
        def _callback(_, inputs, results):
            """Callback implementation"""
            record_list.append((inputs[0], results[0]))

        return _callback

    builder = autotvm.LocalBuilder(n_parallel=n_parallel,
                                   build_func=build_func)
    runner = autotvm.LocalRunner(number=min_exec_num,
                                 repeat=1,
                                 timeout=timeout)
    if use_rpc:
        if device_key is None:
            raise RuntimeError(
                "device_key need to be set to use rpc tracker mode.")
        runner = autotvm.measure.RPCRunner(device_key,
                                           host,
                                           port,
                                           n_parallel=n_parallel,
                                           number=min_exec_num,
                                           repeat=1,
                                           timeout=timeout)
    measure_option = autotvm.measure_option(builder=builder, runner=runner)

    for args in args_list:
        data, in_layout, out_layout = args
        args = serialize_args(args)
        ltf_workload = (
            'layout_transform', ) + autotvm.task.args_to_workload(args)
        # Already known from the supplied records or an earlier iteration.
        if ltf_workload in self._layout_transform_perf_records:
            continue

        if infer_layout:
            input_shape = ltf_workload[1][1]
            flops = 1
            for i in input_shape:
                flops *= i

            # Rule out invalid layout transformations
            out = topi.layout_transform(data, in_layout, out_layout)
            out_flops = 1
            for i in topi.util.get_const_tuple(out.shape):
                out_flops *= i

            if flops != out_flops:
                inferred_time = INVALID_LAYOUT_TIME
            else:
                inferred_time = flops * avg_time

            record_input = MeasureInput(target=self._target,
                                        task=None,
                                        config=None)
            record_output = MeasureResult(costs=(inferred_time, ),
                                          error_no=0,
                                          all_cost=-1,
                                          timestamp=-1)
            self._layout_transform_perf_records[ltf_workload] = (
                record_input, record_output)
            continue

        # Otherwise benchmark the transformation with a single trial.
        records = []
        task = autotvm.task.create(layout_transform,
                                   args=args,
                                   target=self._target,
                                   target_host=target_host)
        task.workload = ltf_workload
        tuner = autotvm.tuner.GridSearchTuner(task)
        tuner.tune(n_trial=1,
                   measure_option=measure_option,
                   callbacks=[_log_to_list(records)])
        # A non-float cost is replaced by the invalid-layout sentinel.
        if not isinstance(records[0][1].costs[0], float):
            records[0] = (records[0][0], records[0][1]._replace(
                costs=(INVALID_LAYOUT_TIME, )))
        self._layout_transform_perf_records[ltf_workload] = records[0]

    self._iterate_layout_transform(self._create_matrix_callback)
    self._logger.info("Benchmarking layout transformation successful.")
def test_tuple():
    """Tune a graph whose two conv2d outputs are concatenated.

    Both DPTuner and PBQPTuner are run on the same records and must select
    the same configs.
    """
    target = "llvm"
    dtype = "float32"
    dshape = (1, 5, 32, 32)
    layout = "NCHW"
    target_ops = [relay.nn.conv2d]

    # Two convolutions on the same input, joined along the channel axis.
    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data, w0, channels=2, kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(data, w1, channels=3, kernel_size=(3, 3),
                            padding=(1, 1))
    graph_out = relay.concatenate([conv0, conv1], axis=1)
    net = relay.Function(relay.analysis.free_vars(graph_out), graph_out)
    net, params = relay.testing.create_workload(net)

    tasks = autotvm.task.extract_from_program(net["main"],
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))
    wkl_list = [
        create_workload((1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
    ]
    costs = [0.01, 0.012, 0.03, 0.04]
    entity_table = [
        [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 2]],
         ["tile_ow", "sp", [4, 8]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 3]],
         ["tile_ow", "sp", [2, 16]], ["unroll_kw", "ot", False]],
        [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [2, 1]],
         ["tile_ow", "sp", [4, 8]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [3, 1]],
         ["tile_ow", "sp", [2, 16]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict(
            {"i": -1, "c": None, "e": entity, "t": ""})
        for entity in entity_table
    ]

    # Each workload/task is used twice so that it gets two candidates.
    records = []
    wkl_list = wkl_list + wkl_list
    tasks = tasks + tasks
    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
        task.workload = wkl
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    # A single layout_transform record used to infer the other transforms.
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c",
        "NCHW512c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_task = copy.deepcopy(tasks[0])
    ltf_task.workload = ltf_wkl
    ltf_records = [(
        MeasureInput(target=target, task=ltf_task, config=None),
        MeasureResult(costs=(1.91224744e-05, ),
                      error_no=0,
                      all_cost=-1,
                      timestamp=-1),
    )]

    for tuner_cls in (DPTuner, PBQPTuner):
        executor = tuner_cls(net, {"data": dshape}, records, target_ops,
                             target)
        executor.benchmark_layout_transform(layout_records=ltf_records,
                                            infer_layout=True)
        executor.run()
        out = [pair[0].config for pair in executor.get_optimal_records()]
        expected_out = [records[2][0].config, records[1][0].config]
        assert expected_out == out, "Output mismatch: expecting %s but got %s" \
            % (str(expected_out), str(out))
def _create_data(target, dshape, dtype, layout):
    """Build the sample graph and records shared by the tuner tests.

    Returns
    -------
    tuple
        ``(net, records, ltf_records, ltf_keys, tasks)``: the relay function,
        one measurement record per extracted conv2d task, one
        layout_transform record, three layout_transform workload keys, and
        the extracted tasks.
    """
    # conv0 -> conv1 -> conv2, with conv1 added back onto conv2.
    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
    w2 = relay.var("w2_weight")
    conv2 = relay.nn.conv2d(conv1, w2, channels=32, kernel_size=(3, 3),
                            padding=(1, 1))
    out = relay.add(conv1, conv2)
    net = relay.Function(relay.analysis.free_vars(out), out)
    mod, params = relay.testing.create_workload(net)

    tasks = autotvm.task.extract_from_program(mod["main"],
                                              target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d, ))
    wkl_list = [
        create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0), (1, 1),
                        layout, layout, dtype, dtype),
        create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1), (1, 1),
                        layout, layout, dtype, dtype),
    ]
    costs = [0.04, 0.012, 0.03]
    entity_table = [
        [["tile_ic", "sp", [3, 1]], ["tile_oc", "sp", [4, 4]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [2, 8]], ["tile_oc", "sp", [1, 32]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [8, 4]], ["tile_oc", "sp", [4, 8]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict(
            {"i": -1, "c": None, "e": entity, "t": ""})
        for entity in entity_table
    ]

    records = []
    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
        task.workload = wkl
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    # One measured layout_transform record.
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c",
        "NCHW512c"
    ]
    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
    ltf_wkl = ('layout_transform', ) + autotvm.task.args_to_workload(ltf_arg)
    ltf_task = copy.deepcopy(tasks[0])
    ltf_task.workload = ltf_wkl
    ltf_records = [(
        MeasureInput(target=target, task=ltf_task, config=None),
        MeasureResult(costs=(1.91224744e-05, ),
                      error_no=0,
                      all_cost=-1,
                      timestamp=-1),
    )]

    # Workload keys for three further layout transformations:
    # (placeholder shape, source layout, destination layout).
    key_specs = [
        ((1, 4, 8, 8, 4), "NCHW4c", "NCHW8c"),
        ((1, 1, 8, 8, 32), "NCHW32c", "NCHW4c"),
        ((1, 4, 8, 8, 8), "NCHW8c", "NCHW32c"),
    ]
    ltf_keys = []
    for shape, src_layout, dst_layout in key_specs:
        key_arg = [
            tvm.placeholder(shape, dtype=dtype), src_layout, dst_layout
        ]
        key_arg = autotvm.task.topi_integration.serialize_args(key_arg)
        ltf_keys.append(
            ('layout_transform', ) + autotvm.task.args_to_workload(key_arg))

    return net, records, ltf_records, ltf_keys, tasks
def test_triangle_block():
    """Tune a graph where three conv2d outputs are concatenated.

    Both DPTuner and PBQPTuner are run on the same records and must select
    the same configs.
    """
    target = "llvm"
    dtype = "float32"
    dshape = (1, 3, 8, 8)
    layout = "NCHW"
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    # conv0 and conv2 read the input, conv1 reads conv0; all three are joined.
    data = relay.var("data", shape=dshape, dtype=dtype)
    w0 = relay.var("w0_weight")
    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3),
                            padding=(1, 1))
    w1 = relay.var("w1_weight")
    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
    w2 = relay.var("w2_weight")
    conv2 = relay.nn.conv2d(data, w2, channels=32, kernel_size=(3, 3),
                            padding=(1, 1))
    graph_out = relay.concatenate([conv0, conv1, conv2], axis=1)
    net = relay.Function(relay.analysis.free_vars(graph_out), graph_out)
    net, params = relay.testing.create_workload(net)

    tasks = autotvm.task.extract_from_program(net["main"],
                                              target=target,
                                              params=params,
                                              ops=(conv2d, ))
    costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045]
    entity_table = [
        [["tile_ic", "sp", [3, 1]], ["tile_oc", "sp", [4, 4]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [2, 8]], ["tile_oc", "sp", [1, 32]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [8, 4]], ["tile_oc", "sp", [4, 8]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
        [["tile_ic", "sp", [1, 3]], ["tile_oc", "sp", [2, 8]],
         ["tile_ow", "sp", [4, 2]], ["unroll_kw", "ot", True]],
        [["tile_ic", "sp", [4, 4]], ["tile_oc", "sp", [2, 16]],
         ["tile_oh", "ot", 1], ["tile_ow", "sp", [4, 2]]],
        [["tile_ic", "sp", [16, 2]], ["tile_oc", "sp", [8, 4]],
         ["tile_ow", "sp", [2, 4]], ["unroll_kw", "ot", False]],
    ]
    config_list = [
        ConfigEntity.from_json_dict(
            {"index": -1, "code_hash": None, "entity": entity})
        for entity in entity_table
    ]

    # Each task is used twice so that it gets two candidate configs.
    records = []
    tasks = tasks + tasks
    for cost, config, task in zip(costs, config_list, tasks):
        ms_input = MeasureInput(target=target, task=task, config=config)
        ms_output = MeasureResult(costs=(cost, ),
                                  error_no=0,
                                  all_cost=-1,
                                  timestamp=-1)
        records.append((ms_input, ms_output))

    # A single layout_transform record used to infer the other transforms.
    ltf_arg = [
        tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c",
        "NCHW512c"
    ]
    ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
    ltf_records = [(
        MeasureInput(target=target, task=ltf_task, config=None),
        MeasureResult(costs=(1.91224744e-05, ),
                      error_no=0,
                      all_cost=-1,
                      timestamp=-1),
    )]

    for tuner_cls in (DPTuner, PBQPTuner):
        executor = tuner_cls(net, {"data": dshape}, records, target_ops,
                             target)
        executor.benchmark_layout_transform(layout_records=ltf_records,
                                            infer_layout=True)
        executor.run()
        out = [pair[0].config for pair in executor.get_optimal_records()]
        expected_out = [
            records[3][0].config, records[1][0].config, records[2][0].config
        ]
        assert expected_out == out, "Output mismatch: expecting %s but got %s" \
            % (str(expected_out), str(out))
def tune_kernels(
    tasks,
    measure_top_n,
    measure_option,
    tuner="random",
    early_stopping=None,
    n_trial=5000,
    log_filename="tuning.log",
):
    """Tune kernels with the ranking model.

    Parameters
    ----------
    tasks : list
        Tasks to tune; each exposes ``name`` and ``target``.
    measure_top_n : int
        Forwarded to ``RoundTuner`` as ``n_cfg``.
    measure_option : dict
        Its "runner" entry must be a ``RankModelRunner``.
    tuner : str, optional
        One of "round", "xgb", "xgb-rank", "ga", "random", "gridsearch".
    early_stopping : optional
        Forwarded to ``tuner_obj.tune``.
    n_trial : int, optional
        Forwarded to ``tuner_obj.tune`` and to the progress callbacks.
    log_filename : str, optional
        File that receives the logged records.

    Raises
    ------
    ValueError
        If ``tuner`` is not one of the supported names.
    """
    use_round = tuner == "round"

    remeasure_option = None
    if use_round:
        # Setup another measure option for final remeasurement.
        remeasure_option = autotvm.measure_option(
            builder=LocalBuilder(),
            runner=measure_option["runner"].local_runner,
        )
    assert isinstance(measure_option["runner"], RankModelRunner)

    n_tasks = len(tasks)
    for task_no, task in enumerate(tasks, start=1):
        prefix = "[Task %2d/%2d] " % (task_no, n_tasks)

        # Tasks without a cost model cannot be ranked; report and skip.
        if task.name not in measure_option["runner"].models:
            print("%s %s not covered by cost models" % (prefix, task.name))
            continue

        # create tuner
        if use_round:
            tuner_obj = RoundTuner(task, n_cfg=measure_top_n)
        elif tuner in ("xgb", "xgb-rank"):
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_round:
            # The round tuner reports progress differently and logs later.
            callbacks = [rank_progress(n_trial, prefix=prefix)]
        else:
            callbacks = [
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ]

        tic = time.time()

        # do tuning
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=callbacks,
        )

        if not use_round:
            continue

        # Round tuner needs an extra measurement step to get the real
        # throughputs.
        top_cfgs = tuner_obj.get_top_rank_cfgs(1)
        measure_batch = create_measure_batch(task, remeasure_option)
        inputs = [MeasureInput(task.target, task, cfg) for cfg in top_cfgs]
        sys.stderr.write("{} Measure Top {} Configs".format(
            prefix, len(inputs)))
        results = measure_batch(inputs)

        # GFLOPS per re-measured config; failed measurements count as 0.
        gflops = []
        for idx, (inp, res) in enumerate(zip(inputs, results)):
            if res.error_no == 0:
                gflops.append((idx, inp.task.flop / np.mean(res.costs) / 1e9))
            else:
                gflops.append((idx, 0))
        best_idx, best_flops = max(gflops, key=lambda pair: pair[1])

        sys.stderr.write(" | Best %.2f GFLOPS at Top %d | %.2fs\n" %
                         (best_flops, best_idx, time.time() - tic))
        autotvm.callback.log_to_file(log_filename)(None, inputs, results)