def autotvm_tuning_opt(target, log_file, dtype="float32"):
    if "cpu" in target.keys:
        print("enable cpu tuning options")
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(
                number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True),
        )
    else:
        print("enable gpu tuning options")
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
            runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        )
    tuning_option = {
        "log_filename": log_file,
        "tuner": "xgb",
        "early_stopping": None,
        "measure_option": measure_option,
    }
    return tuning_option
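# Hypothetical usage sketch (not from the original source): the returned dict
# is meant to be splatted into a tune_tasks-style helper as in the TVM
# tutorials. `tasks` is assumed to come from autotvm.task.extract_from_program,
# and `tune_tasks` is assumed to accept these keyword arguments.
target = tvm.target.Target("llvm")
opts = autotvm_tuning_opt(target, "resnet50.log")
tune_tasks(tasks, **opts)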
def test_tuning_gpu(target, ctx):
    # init task
    task, target = get_sample_task(target, None)
    logging.info("task config space: %s", task.config_space)

    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())

    results = []
    tuner = RandomTuner(task)
    tuner.tune(
        n_trial=20,
        measure_option=measure_option,
        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
    )
    assert len(results) == 20

    successful_results = [
        r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
    ]
    assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
def _tune_topi_cuda(self, name, args, te_tensors, tune_kwargs):
    n_trial = tune_kwargs.get('n_trial', 40)
    preserve_log = tune_kwargs.get('preserve_log', False)
    tmp_file_name = slugify(name) + '.topi_cuda.log'
    if n_trial > 0:
        task = autotvm.task.create(self.topi_cuda_task_name, args=args, target='cuda')
        tuner = tune_kwargs.get('tuner', autotvm.tuner.XGBTuner(task))
        tuner.tune(
            n_trial=n_trial,
            measure_option={
                'builder': tune_kwargs.get('builder', autotvm.LocalBuilder()),
                'runner': tune_kwargs.get(
                    'runner',
                    autotvm.LocalRunner(timeout=20, **default_tune_eval_settings)),
            },
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=f'TOPI {name}'),
                autotvm.callback.log_to_file(tmp_file_name),
                *tune_kwargs.get('callbacks', [])
            ])
    with autotvm.apply_history_best(tmp_file_name):
        result = self._build_topi_cuda(name, args, te_tensors)
    if not preserve_log:
        os.remove(tmp_file_name)
    return result
def test_tuning_cpu():
    ir_mod = tvm.parser.fromtext(
        textwrap.dedent(
            """
            #[version = "0.0.5"]
            def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) {
                nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW")
            }
            """
        )
    )
    tasks = autotvm.task.relay_integration.extract_from_program(
        ir_mod, {}, tvm.target.create("llvm"))
    assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
    task = tasks[0]

    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())

    results = []
    tuner = RandomTuner(task)
    tuner.tune(
        n_trial=20,
        measure_option=measure_option,
        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
    )
    assert len(results) == 20

    successful_results = [
        r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
    ]
    assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
def test_convolution(n_trial=2000,
                     early_stopping=400,
                     learn_start=50,
                     memory_capacity=1000,
                     update_frequency=50,
                     discount=0.99,
                     epsilon=(1.0, 0.01, 0.99)):
    """Test simple convolution with RLTuner."""
    mod, params = _get_relay_convolution()
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        target_host=tvm.target.Target("llvm"),
        params=params)
    runner = autotvm.LocalRunner(number=1, repeat=4)
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner)
    prefix = "[Task 1/1]"
    tuner_obj = GADQNTuner(tasks[0],
                           learn_start=learn_start,
                           memory_capacity=memory_capacity,
                           update_frequency=update_frequency,
                           discount=discount,
                           epsilon=epsilon,
                           debug=True)
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=early_stopping,
        measure_option=measure_option,
        callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix)])
def main():
    target = tvm.target.arm_cpu()
    batch_size = 1
    dtype = 'uint8'
    quant_model_url = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
    model_name = "mobilenet_v1_1.0_224_quant"
    log_file = "%s.log" % model_name
    input_tensor = "input"
    tuning_option = {
        'log_filename': log_file,
        'tuner': 'random',
        'early_stopping': 800,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
        ),
    }
    mod, params, data_shape = tune(tuning_option, target, quant_model_url,
                                   model_name, batch_size, input_tensor,
                                   need_tune=False)
    evaluate(log_file, mod, params, target, input_tensor, data_shape,
             input_dtype=dtype)
def _test_op_with_ga(save_path, save_name, workload_name, n_trial, early_stopping):
    """Test a specified single workload with GA tuner."""
    print(f"Running experiment with settings: n trial: {n_trial}, "
          f"early stopping: {early_stopping}")
    mod, params = _get_relay_workload(workload_name)
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        target_host=tvm.target.Target("llvm"),
        params=params)
    runner = autotvm.LocalRunner(number=1, repeat=4)
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner)
    prefix = "[Task 1/1]"
    tuner_obj = GATuner(tasks[0], debug=True)
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=early_stopping,
        measure_option=measure_option,
        callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix)])
    tuner_obj.save_model(save_path, save_name)
def runner(target):
    # init task
    task, target = get_sample_task(target, None)
    logging.info("task config space: %s", task.config_space)

    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())

    results = []
    tuner = RandomTuner(task)
    tuner.tune(
        n_trial=20,
        measure_option=measure_option,
        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
    )
    assert len(results) == 20

    successful_results = [
        r for r in results
        if r.error_no == autotvm.MeasureErrorNo.NO_ERROR
        # We filter records before building if we know they won't work ahead of time.
        # We can't guarantee we get one good record, so we count these as successes too.
        or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR
    ]
    assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
def auto_schedule(func, args):
    global function
    function = func

    # Opening in 'w' mode already truncates the log file.
    open("matmul.log", 'w', encoding="utf-8").close()

    # Pick the template and trial budget from the argument count:
    # 4 args means GEMM, otherwise convolution.
    if len(args) == 4:
        config_sp_size = 100
        autotvm_func = GEMMAutoTVM
    else:
        config_sp_size = 200
        autotvm_func = CONVAutoTVM
    task = autotvm.task.create(autotvm_func, args=tuple(args), target='llvm')
    print(task.config_space)
    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=3))

    # Begin tuning, log records to file `matmul.log`.
    tuner = autotvm.tuner.GATuner(task)
    tuner.tune(n_trial=config_sp_size,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.create("llvm"):
            s, arg_bufs = autotvm_func(*args)
            print(tvm.lower(s, arg_bufs, simple_mode=True))
            return s, arg_bufs
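# Hypothetical driver (not in the original source): auto-schedule a
# 512x512x512 GEMM, assuming the GEMMAutoTVM template takes (M, N, K, dtype).
s, bufs = auto_schedule(GEMMAutoTVM, (512, 512, 512, "float32"))
gemm_func = tvm.build(s, bufs, target="llvm")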
def test_check_correctness():
    task, target = get_sample_task()
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(check_correctness=True))

    def _callback_correct(tuner, measure_inputs, measure_results):
        for _, res in zip(measure_inputs, measure_results):
            assert res.error_no == 0

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=2,
               measure_option=measure_option,
               callbacks=[_callback_correct])

    # a bad template
    n = 128
    target = tvm.target.Target("llvm -device=bad_device")
    task = autotvm.task.create("testing/bad_matmul",
                               args=(n, n, n, "float32"),
                               target=target)

    def _callback_wrong(tuner, measure_inputs, measure_results):
        for _, res in zip(measure_inputs, measure_results):
            assert res.error_no == MeasureErrorNo.WRONG_ANSWER

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=2,
               measure_option=measure_option,
               callbacks=[_callback_wrong])
def create_measure(device, flag="t4"):
    if device == 'arm' or device == 'aarch64':
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(
                build_func='ndk' if use_android else 'default'),
            runner=autotvm.RPCRunner(
                "pi",
                host='0.0.0.0',
                port=9190,
                number=5,
                timeout=10,
            ))
    elif 'x86' in device:
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(number=5, repeat=1, min_repeat_ms=1000),
        )
    elif device == 'gpu':
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=1000),
            runner=autotvm.RPCRunner(
                flag,  # change the device key to your key
                '0.0.0.0',
                9190,
                number=20,
                repeat=3,
                timeout=1000,
                min_repeat_ms=150))
    else:
        # Fail early instead of returning an unbound measure_option.
        raise ValueError("Unsupported device: %s" % device)
    return measure_option
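# Hypothetical usage (not in the original source): pick measurement options for
# a local x86 host versus a remote T4 GPU registered with the RPC tracker.
cpu_measure = create_measure('x86_64')
gpu_measure = create_measure('gpu', flag='t4')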
def run(name, N, H, W, CO, CI, KH, KW, stride, pad, dilation):
    strides, padding = (stride, stride), (pad, pad)
    task = autotvm.task.create(
        conv2d_nchw,
        args=(N, H, W, CO, CI, KH, KW, strides, padding, dilation),
        target='cuda')
    print(task.config_space)

    logfile = "conv2d_" + name + ".log"

    # Use the local GPU. Each config is run 3 times (for at least 100 ms per
    # run) to reduce variance; a measurement times out after 10 seconds.
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=10)
    )

    # Begin tuning, log records to `conv2d_<name>.log`.
    # During tuning many invalid configs will be tried, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(logfile)])

    # Finally we can inspect the best config from the log file, check
    # correctness, and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_nchw(N, H, W, CO, CI, KH, KW,
                                      strides, padding, dilation)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty((N, CO,
                          (H + 2 * pad - KH) // stride + 1,
                          (W + 2 * pad - KW) // stride + 1), ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)
    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we run the kernel 10 times to reduce noise
    # and kernel-launch overhead. You can also use nvprof to validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    print('Time cost of this operator: %f' % cost)
    with open("autotvm_conv_nchw.txt", "a") as f:
        f.write("{}, {}\n".format(name, cost))
def test_tuning(target, ctx):
    # init task
    task, target = get_sample_task(target, None)
    logging.info("%s", task.config_space)

    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())

    tuner = RandomTuner(task)
    tuner.tune(n_trial=20, measure_option=measure_option)
def tune_cuda_tile(name, tree, kernel_args, parser, n_trial=40,
                   tuner=None, measure_option=None, callbacks=None,
                   preserve_log=False):
    tmp_file_name = slugify(name) + '.cuda_tile.log'
    task = CUDATileTask(name, tree.copy(), kernel_args, parser)

    # Smoke-test the template: instantiate a random config and make sure it builds.
    from random import randint
    stmt, args = task.instantiate(
        task.config_space.get(randint(0, len(task.config_space) - 1)))
    kernel = tvm.build(stmt, name=name, target='cuda')

    if n_trial > 0:
        if tuner is None:
            tuner = autotvm.tuner.XGBTuner(task, feature_type='knob')
        else:
            tuner = tuner(task)
        tuner.tune(
            n_trial=n_trial,
            measure_option={
                'builder': autotvm.LocalBuilder(),
                'runner': autotvm.LocalRunner(timeout=20, **default_tune_eval_settings),
                **(measure_option or {}),
            },
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=f'CUDATile {name}'),
                autotvm.callback.log_to_file(tmp_file_name),
                *(callbacks or [])
            ])

    best, best_cost = load_best(tmp_file_name, task)
    import gc
    gc.collect()
    if not best:
        raise Exception('failed to build kernel')
    best = CUDATileConfigEntity.from_json_dict(best)
    print('CUDATile %s: best %s, best cost %.12f' % (name, repr(best), best_cost))
    if not preserve_log:
        os.remove(tmp_file_name)
    return best, best_cost
def tune_kernels(tasks,
                 builder=None,
                 runner=None,
                 tuner='ga',
                 early_stopping=None,
                 log_filename=log_file):
    # Create the builder/runner lazily so they are not instantiated at
    # module-import time as default arguments.
    if builder is None:
        builder = autotvm.LocalBuilder()
    if runner is None:
        runner = autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000)
    measure_option = autotvm.measure_option(builder, runner)

    for i, tsk in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # converting conv2d tasks to conv2d_NCHWc tasks
        if tsk.workload:
            op_name = tsk.workload[0]
            if op_name == 'conv2d':
                func_create = 'topi_x86_conv2d_NCHWc'
            elif op_name == 'depthwise_conv2d_nchw':
                func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
            else:
                raise ValueError("Tuning {} is not supported on x86".format(op_name))
            task = autotvm.task.create(func_create,
                                       args=tsk.args,
                                       target=target,
                                       template_key='direct')
            task.workload = tsk.workload
        else:
            task = tsk

        # create tuner
        if tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=1000)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = len(task.config_space)
        print("n_trial", n_trial)
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(log_filename)
                       ])
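# Hypothetical driver (not in the original source): extract conv2d tasks from
# a Relay module and feed them to tune_kernels. `mod`, `params`, and `target`
# are assumed to be defined by the surrounding script.
tasks = autotvm.task.extract_from_program(
    mod["main"], target=target, params=params,
    ops=(relay.op.get("nn.conv2d"),))
tune_kernels(tasks, tuner='xgb', early_stopping=100, log_filename='tune.log')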
def test_gemm(mm, nn, ll):
    # correctness
    m, n, l = mm, nn, ll
    dtype = 'float32'

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    log_file = 'gemm.log'
    task = autotvm.task.create('gemm-autotune/gemm_v2', args=(m, n, l), target='cuda')
    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
    )
    tuner = autotvm.tuner.XGBTuner(task, feature_type='knob')
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])

    dispatch_context = autotvm.apply_history_best(log_file)
    best_config = dispatch_context.query(task.target, task.workload)
    print('\nBest config:')
    print(best_config)

    with autotvm.apply_history_best(log_file):
        with tvm.target.create('cuda'):
            s, arg_bufs = gemm_autotune(m, n, l)
            f = tvm.build(s, arg_bufs)

    # launch the kernel
    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(l, n)).astype(dtype)
    b_np = np.random.uniform(size=(l, m)).astype(dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((m, n), dtype=dtype), ctx)
    for i in range(2):
        f(a, b, c)
        print('function called')
    tvm.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)

    num_flops = 2 * nn * mm * ll
    num_runs = 10
    timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
    t = timer_f(a, b, c).mean
    tflops = num_flops / (t * 1e12)
    print("average time cost of %d runs = %g ms, %g TFLOPS." %
          (num_runs, t * 1e3, tflops))
def test_sparse_dense_bsr_autotune(M, N, K, BS_R, BS_C, density):
    """Benchmark sparse-dense matrix multiplication with auto-tuning enabled."""
    print("testing param", M, N, K, BS_R, BS_C, density)
    X_np = np.random.randn(M, K).astype("float32")
    W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
    W_np = W_sp_np.todense()
    Y_np = X_np.dot(W_np.T)

    # logging config (for printing tuning log to screen)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    W_sp_np_data_shape = W_sp_np.data.shape
    W_sp_np_indices_shape = W_sp_np.indices.shape
    W_sp_np_indptr_shape = W_sp_np.indptr.shape
    X_np_shape = X_np.shape
    task = autotvm.task.create(
        "benchmark/block_sparse",
        args=(W_sp_np_data_shape, W_sp_np_indices_shape,
              W_sp_np_indptr_shape, X_np_shape),
        target='cuda')

    # Use the local GPU, measuring multiple times for every config to reduce
    # variance. The timeout for running is 4 seconds.
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=args.repeat, min_repeat_ms=100, timeout=4)
    )

    # Begin tuning, logging records to `args.autotvm_log`.
    # During tuning many invalid configs will be tried, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    if args.tune:
        tuner.tune(n_trial=args.n_trial,
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.log_to_file(args.autotvm_log)])

    # apply history best from log file
    with autotvm.apply_history_best(args.autotvm_log):
        with tvm.target.create("cuda"):
            s, arg_bufs = block_sparse_template(
                W_sp_np_data_shape, W_sp_np_indices_shape,
                W_sp_np_indptr_shape, X_np_shape)
            func = tvm.build(s, arg_bufs)

    timer = func.time_evaluator(func.entry_name, context, number=20)
    Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=context)
    mean_time = timer(tvm.nd.array(X_np, ctx=context),
                      tvm.nd.array(W_sp_np.data, ctx=context),
                      tvm.nd.array(W_sp_np.indices, ctx=context),
                      tvm.nd.array(W_sp_np.indptr, ctx=context),
                      Y_tvm).mean
    print('%g ms' % (mean_time * 1e3))
    print("------------------------")
    tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-4, rtol=1e-4)
def _test_op_with_dqnga(save_path, save_name, workload_name, n_trial,
                        early_stopping, learn_start, update_frequency,
                        train_frequency, discount, epsilon_decay,
                        agent_batch_size, hidden_sizes, learning_rate,
                        reward_function=RewardFunction.R3):
    """Test a specified single workload using RLTuner."""
    print(f"Running experiment with settings: n trial: {n_trial}, "
          f"early stopping: {early_stopping}, learn start: {learn_start}, "
          f"update frequency: {update_frequency}, discount: {discount}, "
          f"ep decay: {epsilon_decay}, hidden sizes: {hidden_sizes}, "
          f"agent batch size: {agent_batch_size}, learning rate: {learning_rate}")
    mod, params = _get_relay_workload(workload_name)
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        target_host=tvm.target.Target("llvm"),
        params=params)
    runner = autotvm.LocalRunner(number=1, repeat=4)
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner)
    prefix = "[Task 1/1]"
    tuner_obj = GADQNTuner(tasks[0],
                           learn_start=learn_start,
                           target_update_frequency=update_frequency,
                           train_frequency=train_frequency,
                           discount=discount,
                           epsilon_decay=epsilon_decay,
                           agent_batch_size=agent_batch_size,
                           hidden_sizes=hidden_sizes,
                           learning_rate=learning_rate,
                           reward_function=reward_function)
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=early_stopping,
        measure_option=measure_option,
        callbacks=[autotvm.callback.progress_bar(n_trial, prefix=prefix)])
    tuner_obj.save_model(save_path, save_name)
def run_one_wkl(wkl, new_log_path, inputs):
    task = wkl.to_task()

    # Re-measure the given configs and write the results to the new log.
    log_writer = log_to_file(new_log_path)
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=5, repeat=1, min_repeat_ms=1000))
    measure_batch = create_measure_batch(task, measure_option)
    results = measure_batch(inputs)
    log_writer(None, inputs, results)
    del measure_batch
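# Hypothetical usage (not in the original source): re-measure records from an
# existing log. MeasureInput objects can be read back with
# autotvm.record.load_from_file; `wkl` is assumed to be defined elsewhere.
inputs = [inp for inp, _ in autotvm.record.load_from_file("old_tuning.log")]
run_one_wkl(wkl, "new_tuning.log", inputs)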
def check(target, target_host):
    ctx = tvm.context(target, 0)
    if not ctx.exist:
        logging.info("Skip test because %s is not available", target)
        return

    # init task
    task, target = get_sample_task(target, target_host)
    logging.info("%s", task.config_space)

    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())

    tuner = RandomTuner(task)
    tuner.tune(n_trial=20, measure_option=measure_option)
def execute_autotune_task(template_name, dtype, N, L, M, target, n_trial,
                          number, log_path):
    task = autotvm.task.create(template_name, args=(N, L, M, dtype), target=target)
    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder="local", runner=autotvm.LocalRunner(number=number))

    # Begin tuning with RandomTuner, logging records to `log_path`.
    # You can use alternatives like XGBTuner.
    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(
        n_trial=n_trial,
        measure_option=measure_option,
        callbacks=[autotvm.callback.log_to_file(log_path)],
    )
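# Hypothetical usage (not in the original source): tune a 512x512x512 matmul
# template registered as "tutorial/matmul", then compile with the best config.
# The `matmul` template function is an assumption borrowed from the TVM tutorial.
execute_autotune_task("tutorial/matmul", "float32", 512, 512, 512,
                      "llvm", n_trial=10, number=5, log_path="matmul.log")
with autotvm.apply_history_best("matmul.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul(512, 512, 512, "float32")
        func = tvm.build(s, arg_bufs)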
def main():
    parser = argparse.ArgumentParser(description='Tune ops')
    parser.add_argument('output', type=str)
    parser.add_argument('ops', type=str, nargs='+')
    parser.add_argument('--batchsize', type=int)
    parser.add_argument('--base', type=str)
    parser.add_argument('--target', type=str, default='cuda')
    args = parser.parse_args()

    tasks = read_tasks(args.ops, args.batchsize)
    print('Read %d tasks from %d files' % (len(tasks), len(args.ops)))

    if args.base:
        base_config = TVMConfig(args.base)
        discard_keys = []
        for task_key, (filename, task) in tasks.items():
            query = autotvm_key_from_task(task)
            if base_config.contains(query):
                print('%s is already tuned' % filename)
                discard_keys.append(task_key)
        for task_key in discard_keys:
            tasks.pop(task_key)
        print('Removed %d tasks. Will tune for %d tasks.' %
              (len(discard_keys), len(tasks)))

    tuning_opt = {
        'log_filename': args.output,
        'tuner': 'xgb',
        'n_trial': 2000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
            runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
        ),
    }
    tvm_tasks = []
    for task_key, (filename, task) in tasks.items():
        print('Tuning for %s' % filename)
        tvm_tasks.append(autotvm_task(task, args.target))
    tune_tasks(tvm_tasks, **tuning_opt)
def tuning_model(model_path):
    dtype = 'float32'
    ox, shape_dict = get_model(model_path)
    input_name = list(shape_dict.keys())[0]
    device_key = None
    if args.target == 'gpu':
        device_key = 'V100'
    use_android = False
    log_file = get_logfile()
    other_option = {
        'model_path': model_path,
        'dtype': dtype,
        'input_name': input_name,
        'device_key': device_key,
        'use_android': use_android
    }

    if args.target == 'x86' or args.target == 'cpu':
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(
                number=10,
                repeat=1,
                min_repeat_ms=1000
            )
        )
    elif args.target == 'gpu':
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
            runner=autotvm.RPCRunner(
                device_key,
                '0.0.0.0', 9190,
                number=20,
                repeat=3,
                timeout=4,
                min_repeat_ms=150)
        )
    else:
        # Fail early instead of referencing an unbound measure_option below.
        raise ValueError("Unsupported target: %s" % args.target)

    n_trial = 200
    tuning_option = {
        'log_filename': log_file,
        'tuner': 'xgb',
        'n_trial': n_trial,
        'early_stopping': 80,
        'measure_option': measure_option
    }
    graph, lib, params = tuning(tuning_option, **other_option)
    return graph, lib, params
def test_min_repeat_ms():
    task, target = get_sample_task()
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100))

    def _callback(tuner, measure_inputs, measure_results):
        for inp, res in zip(measure_inputs, measure_results):
            if res.error_no != 0:
                continue
            # With min_repeat_ms=100, the runner must have raised `number`
            # until the total measured time reached at least 100 ms.
            assert 1000 * np.mean(res.costs) * \
                measure_option['runner'].cur_number >= 100

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=5, measure_option=measure_option, callbacks=[_callback])
def test_graph_tuner_layout_transform_runner():
    log_file = "%s/test_tuner.log" % os.getcwd()
    target = "llvm"
    dshape = (1, 3, 8, 8)
    dtype = "float32"
    layout = "NCHW"
    conv2d = relay.op.get("nn.conv2d")
    target_ops = [conv2d]

    g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout)
    executor = DPTuner(g, {"data": dshape}, records, target_ops,
                       target=target, log_file=log_file)
    runner = autotvm.LocalRunner(number=100, repeat=1, timeout=10)
    executor.benchmark_layout_transform(layout_records=ltf_records,
                                        infer_layout=True,
                                        runner=runner)
    out = executor._layout_transform_perf_records

    num_flops = 0
    total_time = 0
    for record in ltf_records:
        ltf_wkl = record[0].task.workload
        input_shape = ltf_wkl[1][1]
        flops = np.prod(input_shape)
        num_flops += flops
        total_time += record[1].costs[0]
    avg_time = total_time / num_flops

    for ltf_workload in out:
        input_shape = ltf_workload[1][1]
        flops = 1
        for i in input_shape:
            flops *= i
        expected_time = flops * avg_time
        out_time = out[ltf_workload][1].costs[0]
        assert expected_time == out_time, (
            "Inferred layout transformation time mismatch for %s: "
            "expecting %f but got %f" % (str(ltf_workload), expected_time, out_time))
def runner(target, dev):
    task, target = get_sample_task(target, None)
    logging.info("task config space: %s", task.config_space)

    # Note: we use the MockedLocalBuilder here instead of autotvm.LocalBuilder()
    measure_option = autotvm.measure_option(MockedLocalBuilder(), autotvm.LocalRunner())

    results = []
    tuner = RandomTuner(task)
    tuner.tune(
        n_trial=1,
        measure_option=measure_option,
        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
    )
    assert len(results) == 1
def get_tuning_opt(log_file="tuning.log", n_trial=200):
    """Returns tuning options."""
    tuning_opt = {
        "log_filename": log_file,
        "tuner": "random",
        "n_trial": n_trial,
        "early_stopping": 60,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10),
            runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
        ),
    }
    return tuning_opt
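# Hypothetical usage (not in the original source): consume the options dict for
# a single task, clamping n_trial to the config-space size. `task` is assumed
# to come from autotvm.task.create.
opts = get_tuning_opt("matmul_tuning.log", n_trial=100)
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=min(opts["n_trial"], len(task.config_space)),
    early_stopping=opts["early_stopping"],
    measure_option=opts["measure_option"],
    callbacks=[autotvm.callback.log_to_file(opts["log_filename"])],
)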
def _autotune_kernel(self, autokernel):
    params = self.test_parameters
    task = autotvm.task.create(autokernel,
                               args=self._kernel_args(autokernel),
                               target='llvm')
    TestSuite._write_to_infofile(
        autokernel, TestSuite._config_space_info(task.config_space), 'w')
    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder='local',
        runner=autotvm.LocalRunner(number=params.variance_resistance_runs))
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=params.trial_runs,
               measure_option=measure_option,
               callbacks=[
                   autotvm.callback.log_to_file(TestSuite._logfile_path(autokernel))
               ])
def tune_and_evaluate():
    parser = argparse.ArgumentParser(description='Run conv2d benchmarks in TVM')
    parser.add_argument('-b', '--benchmark',
                        help="Int. Number of Tensor Contraction benchmark (1-4)",
                        default=1, type=int)
    parser.add_argument('-f', '--feature',
                        help="Type of feature to use, one of 'datavol', 'itervar', "
                             "'datavol_itervar', 'itervar_silent_dv'",
                        default='itervar', type=str)
    parser.add_argument('-n', '--num_iters',
                        help="Int. number of times to run training",
                        default=1, type=int)
    parser.add_argument('-t', '--trials',
                        help="Int. Number of trials to sample",
                        default=2000, type=int)
    parser.add_argument('-l', '--likwid_event',
                        help='Likwid event to capture during training',
                        default=None)
    parser.add_argument('-r', '--random',
                        help="Use XGB+SA to select samples, or randomly select",
                        default=False, action='store_true')
    parser.add_argument('-k', '--key_id',
                        help="Key ID for RPC server.",
                        default=None, type=str)
    parser.add_argument('--sa_num_iters',
                        help="Number of iterations of simulated annealing",
                        default=500, type=int)
    parser.add_argument('--no_save_features',
                        help="Should save features",
                        default=False, action='store_true')
    args = parser.parse_args()

    trials = args.trials
    ind = args.benchmark
    global matmul_index
    for size in [1000, 4000]:
        # for ind in range(1, 37):
        # for ind in [1, 16, 18, 22, 24, 28, 30, 33, 34, 35, 36, 2]:
        matmul_index = ind
        print("Tuning TC %i..." % matmul_index)
        M, N, K = size, size, size
        tuning_option = {
            'tuner': 'xgb',
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10, n_parallel=80),
                runner=autotvm.LocalRunner(repeat=10, number=4),
            ),
        }
        print("M,N,K", M, N, K)
        tune_kernels(args, M, N, K, trials, **tuning_option)
tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 500,
    'early_stopping': 100,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func='ndk' if use_android else 'default'),
        runner=autotvm.LocalRunner(number=10, repeat=1, timeout=4, min_repeat_ms=150),
        # runner=autotvm.RPCRunner(
        #     device_key, host='localhost', port=9190,
        #     number=10,
        #     timeout=5,
        # ),
    ),
}

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial` and
#   :code:`early_stopping` larger, which makes the tuning run longer.
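# A minimal sketch (hypothetical, not from the original tutorial) of a
# tune_tasks-style consumer for a tuning_option dict like the one above.
def tune_tasks(tasks, measure_option, tuner='xgb', n_trial=500,
               early_stopping=100, log_filename='tuning.log'):
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        if tuner == 'xgb':
            tuner_obj = autotvm.tuner.XGBTuner(task, loss_type='rank')
        else:
            tuner_obj = autotvm.tuner.RandomTuner(task)
        # Never ask for more trials than the config space contains.
        trials = min(n_trial, len(task.config_space))
        tuner_obj.tune(n_trial=trials,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(trials, prefix=prefix),
                           autotvm.callback.log_to_file(log_filename),
                       ])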