def test_correctness_layout_rewrite_insert_transform_stage():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner=measure_ctx.runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(
            inp.state, layout_rewrite=auto_scheduler.compute_dag.ComputeDAG.InsertTransformStage
        )

        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state)
        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        # The two schedules must agree on all buffers: inputs unchanged, outputs equal.
        tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3)
        tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
        del measure_ctx
def __init__(self, task, **kwargs):
    self.task = task
    self.measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)

    @auto_scheduler.register_workload
    def auto_template():
        _, arg_bufs = task.func()
        return arg_bufs

    self.auto_task = auto_scheduler.create_task(auto_template, (), task.target)
def get_sample_records(number):
    """Generate a list of random MeasureInput and MeasureResult pairs."""
    N = 128
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), "llvm")
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    states = policy.sample_initial_population()[:number]

    inputs = [auto_scheduler.MeasureInput(task, s) for s in states]
    results = [
        auto_scheduler.MeasureResult([np.random.uniform(0.5, 1.0)], 0, "", 0.1, 0)
        for _ in range(len(inputs))
    ]
    return task, inputs, results
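
# Illustrative usage (not part of the original tests): a typical consumer of
# these synthetic records is a learned cost model. This sketch assumes
# `XGBModel` is exported by tvm.auto_scheduler and that xgboost is installed.
task, inputs, results = get_sample_records(50)

model = auto_scheduler.XGBModel()
model.update(inputs, results)  # fit on the (MeasureInput, MeasureResult) pairs

# Score a fresh batch of candidate states with the trained model.
states = auto_scheduler.SketchPolicy(task, verbose=0).sample_initial_population()[:5]
scores = model.predict(task, states)
print(scores)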
def test_task_scheduler_round_robin():
    tasks = []
    for n in [2, 4, 8]:
        tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm"))

    def objective_func(costs):
        return sum(costs)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name
        num_trials_per_task = 2

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=num_trials_per_task * len(tasks),
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func, strategy="round-robin")
        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the result of round robin
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        for task in tasks:
            assert counters[task.workload_key] == num_trials_per_task

        # Test continuous tuning (restoring the status)
        task_scheduler = auto_scheduler.TaskScheduler(
            tasks, objective_func, strategy="round-robin", load_log_file=log_file
        )
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=len(tasks),
            num_measures_per_round=1,
        )
        task_scheduler.tune(tune_option, search_policy="sketch.random")
        del measure_ctx
def test_measure_local_builder_runner():
    if not tvm.testing.device_enabled("llvm"):
        return

    task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm")

    for enable_cpu_cache_flush in [True, False]:
        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
        local_builder = auto_scheduler.LocalBuilder()
        local_runner = auto_scheduler.LocalRunner(
            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
        )

        bress = local_builder.build([minp])
        assert bress[0].error_no == 0
        mress = local_runner.run([minp], bress)
        assert mress[0].error_no == 0
def test_mutate_parallel():
    """
    The test case initializes evo search with a batch of "bad" states and checks
    whether the search algorithm can find "good" states by mutating the "bad" states.
    """

    class MockCostModel(PythonBasedModel):
        @staticmethod
        def is_good_state(state):
            for line in str(state).split("\n"):
                if (
                    line.find("parallel i.0@ (0") != -1
                    or line.find("parallel i.0@j.0@ (0") != -1
                    or line.find("parallel i.0@j.0@i.1@ (0") != -1
                ):
                    return True
            return False

        def predict(self, task, states):
            scores = []
            for state in states:
                scores.append(1 if self.is_good_state(state) else 0)
            return scores

    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (1024, 1024, 1024), "llvm")
    policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0)

    found = False
    retry_ct = 0
    while retry_ct < 10 and not found:
        states = policy.sample_initial_population()[:100]
        bad_states = []
        for state in states:
            if not MockCostModel.is_good_state(state):
                bad_states.append(state)

        new_states = policy.evolutionary_search(bad_states, 50)
        for state in new_states:
            if MockCostModel.is_good_state(state):
                found = True
                break
        retry_ct += 1

    assert found
def test_task_scheduler_gradient():
    tasks = []
    for n in [2, 4]:
        tasks.append(auto_scheduler.create_task(matmul_auto_scheduler_test, (n, n, n), "llvm"))

    def objective_func(costs):
        return costs[0]

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        n_trials = 5

        # Tune all tasks
        measure_ctx = auto_scheduler.LocalRPCMeasureContext()
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            num_measures_per_round=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        task_scheduler = auto_scheduler.TaskScheduler(tasks, objective_func=objective_func)

        # Forcibly rewrite the initial values.
        # This can make this test more stable on slow CI machines.
        task_scheduler.best_costs = np.array([1e2, 1e-8])

        task_scheduler.tune(tune_option, search_policy="sketch.random")

        # Check the allocation results
        counters = {}
        for task in tasks:
            counters[task.workload_key] = 0

        for inp, res in auto_scheduler.load_records(log_file):
            counters[inp.task.workload_key] += 1

        assert counters[tasks[0].workload_key] == n_trials - 1
        assert counters[tasks[1].workload_key] == 1
        del measure_ctx
def test_recover_measure_input():
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, [512, 512, 512], "llvm")

    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)

    with tempfile.NamedTemporaryFile() as fp:
        auto_scheduler.save_records(fp.name, [inp], [res])

        log_reader = auto_scheduler.RecordReader(fp.name)
        inputs, results = log_reader.read_lines()
        assert len(inputs) == 1

        raw_inp = inputs[0]

        correct_inp = auto_scheduler.measure_record.recover_measure_input(raw_inp)
        assert str(correct_inp.task.compute_dag) == str(inp.task.compute_dag)

        correct_inp = auto_scheduler.measure_record.recover_measure_input(raw_inp, rebuild_state=True)
        assert str(correct_inp.state) == str(inp.state)
k = te.reduce_axis((0, L), name="k") matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul") out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") return [A, B, C, out] ###################################################################### # Create the search task # ^^^^^^^^^^^^^^^^^^^^^^ # We then create a search task with N=L=M=128 and dtype="float32" target = tvm.target.Target("llvm") task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) # Inspect the computational graph print(task.compute_dag) ###################################################################### # Next, we set parameters for the auto-scheduler. # # * `num_measure_trials` is the number of measurement trials we can use during the search. # We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a # good value for the search to converge. You can do more trials according to your time budget. # * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`. # The measurement records can be used to query the history best, resume the search, # and do more analyses later. # * see :any:`auto_schedule.TuningOptions`: for more parameters
# Note: the default seed below is drawn once at import time, so repeated calls
# share the same seed unless one is passed explicitly.
def search_common(
    workload=matmul_auto_scheduler_test,
    target="llvm",
    search_policy="empty",
    seed=random.randint(1, 1 << 30),
    runner="local",
    cost_model=auto_scheduler.RandomModel(),
    num_measure_trials=10,
    init_search_callbacks=None,
):
    print("Test %s schedule search with the default search policy" % (target))

    random.seed(seed)
    N = 128
    target = tvm.target.Target(target)
    task = auto_scheduler.create_task(workload, (N, N, N), target)

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        init_search_callbacks = init_search_callbacks or []
        init_search_callbacks.append(auto_scheduler.PreloadMeasuredStates(log_file))

        if search_policy == "empty":
            search_policy = auto_scheduler.EmptyPolicy(task)
        elif search_policy == "sketch":
            search_policy = auto_scheduler.SketchPolicy(
                task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks
            )

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=num_measure_trials,
            runner=runner,
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options)

        print("*" * 80)
        print(target)
        print("*" * 80)
        inp, res = auto_scheduler.load_best(log_file, task.workload_key, target)

        print("==== Python Code ====")
        print(task.compute_dag.print_python_code_from_state(inp.state))

        try:
            print("==== Lowered Stmt ====")
            print(tvm.lower(sch, args, simple_mode=True))
            mod = tvm.build(sch, args, target)

            ctx = tvm.context(str(target), 0)
            dtype = task.compute_dag.tensors[0].dtype
            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
            mod(a, b, c)
            tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
            print("==== Verification passed ====")
        except Exception:
            raise Exception("Error encountered with seed: %d" % (seed))
    print()
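
# Illustrative invocations (not part of the original file): this mirrors how
# the surrounding tests drive the helper, fixing the seed for reproducibility.
search_common(seed=0)                          # EmptyPolicy with the default RandomModel
search_common(seed=0, search_policy="sketch")  # SketchPolicy with the default RandomModel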
def test_layout_rewrite_correctness():
    N = 128
    target = tvm.target.Target("llvm")
    task = auto_scheduler.create_task(matmul_auto_scheduler_test, (N, N, N), target)
    dag = task.compute_dag

    with tempfile.NamedTemporaryFile() as fp:
        log_file = fp.name

        search_policy = auto_scheduler.SketchPolicy(task)

        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=2,
            runner="local",
            verbose=1,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
        auto_scheduler.auto_schedule(task, search_policy, tuning_options)
        inp, _ = auto_scheduler.load_best(log_file, task.workload_key, target)
        s, bufs = dag.apply_steps_from_state(inp.state, layout_rewrite=True)
        s_ref, bufs_ref = dag.apply_steps_from_state(inp.state, layout_rewrite=False)
        np_args = [np.random.randn(*topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
        np_args_ref = [np.array(x) for x in np_args]

        weight = np_args_ref[1]
        # Infer the shape of the rewritten weight layout
        if len(weight.shape) >= 6:
            # For the CPU tile structure SSRSRS
            base = len(weight.shape) - 6
            red_dim = weight.shape[2 + base] * weight.shape[4 + base]
            out_dim = weight.shape[3 + base] * weight.shape[5 + base]
            for i in range(base + 2):
                out_dim *= weight.shape[i]
            new_order = [2 + base, 4 + base] + list(range(base + 2)) + [3 + base, 5 + base]
            np_args_ref[1] = np_args_ref[1].transpose(new_order)
            np_args_ref[1] = np_args_ref[1].reshape((red_dim, out_dim))

        func = tvm.build(s, bufs, target=target)
        func_ref = tvm.build(s_ref, bufs_ref, target=target)

        ctx = tvm.context(str(target))
        ctx_ref = tvm.cpu()

        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref]
        ctx.sync()

        func(*args)
        func_ref(*args_ref)
        ctx.sync()

        # Compare the device buffers, not the host copies: tvm.nd.array copies the
        # numpy data, so the computed results live in args / args_ref. args[1] is the
        # weight, whose layouts (and shapes) intentionally differ, so it is skipped.
        np.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
        np.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
def main_compute(code_only=False):
    tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module(
        'templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    if verbose:
        print(' >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key,
                    'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index, "time": "", "code_hash": code_hash, "entity": json_dict})
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert jobj[i][1] in ['sp', 'ot', 're']
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" %
                  (dump_to_file, json_space))
            with open(dump_to_file, "w") as fp:
                fp.write(json_space)
            sys.exit(0)

        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', '')
        if not tuner_type:
            comp = os.environ['COMPUTE_V1']
            if ('=!' in comp and 'plan/' not in comp[comp.find(' ##') + 1:]
                    and ';' not in comp and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']):
                tuner_type = 'AutoTVM2'
            else:
                tuner_type = 'XGBoost'
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception("Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:
            AntaresGlobal.current_step = 0

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(thread_pool.submit(
                        run_config_entity, config_to_json(inputs[i].config),
                        AntaresGlobal.current_step + i + 1, expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = AntaresGlobal.current_step + i + 1
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(autotvm.measure.MeasureResult(
                        costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
                    AntaresGlobal.current_step, num_trials,
                    json.dumps(config_to_json(tuner.task.best.config)),
                    compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                    compute_mem_ratio(tuner.task.best.timecost),
                    tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, best_slot))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials, measure_option=autotvm.measure_option(
                builder=autotvm.LocalBuilder(n_parallel=batch_size),
                runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
            ), callbacks=callbacks)
            assert not math.isinf(tuner.task.best.timecost), "No valid config found in the whole tuning."

            best_config = json.dumps(config_to_json(tuner.task.best.config))

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

            print("\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
                best_config,
                compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print(" >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = ''

    assert isinstance(best_config, str)
    if verbose:
        print("====>> [Current Config Option]", best_config)

    if best_config.startswith('['):
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [['["main_compute.<locals>.auto_template"]',
                   'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                       device_properties().max_threads_per_block,
                       device_properties().warp_size)],
                  origin_cfg],
            "r": [[0], 0, 0, 0],
            "v": "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
            break
    else:
        config = json_to_config(json.loads(best_config)) if best_config else task.config_space
        with ApplyConfig(config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    device_source, kernel_path = get_target_source(s, arg_bufs)

    if code_only:
        return device_source

    if verbose:
        print("====================================")
        print(device_source)
        print("====================================\n")

    dev_id = int(os.environ.get('DEV_KEY', '0'))
    result = evaluate_perf(kernel_path, task.flop, dev_id)
    exit(0 if result is not None else 1)
    # [[4, 512, 7, 7], [256, 512, 3, 3], [2, 2], [1, 1], [2, 2]],
    # [[1, 512, 14, 14], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    # [[1, 512, 28, 7], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    # [[1, 512, 7, 28], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    # [[2, 512, 7, 14], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    # [[2, 512, 14, 7], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    # [[4, 512, 7, 7], [256, 512, 3, 3], [1, 1], [1, 1], [1, 1]],
    [512, 768, 768, 1],
    [512, 768, 768, 3],
]

for input_task in input_tasks:
    M, N, K, B = input_task
    # task = auto_scheduler.create_task(
    #     conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding, dilation), target)
    task = auto_scheduler.create_task(bmm_layer, (M, N, K, B), target)
    tasks.append(task)

# for input_task in input_tasks:
#     [[N, CI, H, W], [CO, _, KH, KW], padding, strides, dilation] = input_task
#     assert CI == _
#     task = auto_scheduler.create_task(
#         conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding, dilation), target)
#     tasks.append(task)

print('# of tasks = %d' % (len(tasks)))

# Inspect the computational graph
# print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler. These parameters
def main_compute(code_only=False):
    tvm_target = 'cuda'
    tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module(
        'templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    print(' >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key,
                    'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index,
            "time": "",
            "code_hash": code_hash,
            "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert jobj[i][1] in ['sp', 'ot', 're']
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        if config[0] != '[':
            params_given = json.loads(config)
            print("====>> [Current Config Option]", config)
            best_config = json_to_config(params_given)
        else:
            best_config = config
    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" %
                  (dump_to_file, json_space))
            with open(dump_to_file, "w") as fp:
                fp.write(json_space)
            sys.exit(0)

        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', 'XGBoost')
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception(
                    "Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(
                        thread_pool.submit(run_config_entity,
                                           config_to_json(inputs[i].config), i,
                                           expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = i
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = tuner.task.best.curr_step + i + 1
                    results.append(
                        autotvm.measure.MeasureResult(costs=(t, ),
                                                      error_no=0,
                                                      all_cost=i,
                                                      timestamp=time.time()))
                tuner.task.best.curr_step += len(results)

                print(
                    '\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, Occur Step = %d;'
                    % (tuner.task.best.curr_step, num_trials,
                       json.dumps(config_to_json(tuner.task.best.config)),
                       compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                       tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' %
                        (t, tuner.task.best.curr_step))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1
            tuner.task.best.curr_step = 0

            tuner.measure_batch = measure_batch
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')

            if history_log_for_transfer_learning:
                callbacks.append(
                    autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(
                        autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3,
                                                      min_repeat_ms=100,
                                                      timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(tuner.task.best.timecost), \
                "No valid config found in the whole tuning."
            best_config = tuner.task.best.config
            print(
                "\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec."
                % (json.dumps(config_to_json(best_config)),
                   compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                   tuner.task.best.occur, num_trials, tuner.task.best.timecost))
            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print(" >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = task.config_space

    if isinstance(best_config, str):
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [[
                '["main_compute.<locals>.auto_template"]',
                'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' %
                (device_properties().max_threads_per_block,
                 device_properties().warp_size)
            ], origin_cfg],
            "r": [[0], 0, 0, 0],
            "v": "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
            break
    else:
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))

        lower_file = local_get_dir_file('my_kernel.lower')
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify the lowered code: only a single, un-fused kernel is supported.
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Not Support Multi Unfuse-able kernels]\n\n' + lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties().max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, \
            '[Error] Invalid device properties, maybe device is not detected correctly.'

        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith('attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(
                    ll.split(' "thread_extent" = ')[-1].split(';')[0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find('.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.prod(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[thread_name] == thread_val, \
                    "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, \
            "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
                num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
                shared_memory_inc = int(custom_dtypes[type_name][-1].split('@')[-1])
            else:
                shared_memory_inc = 8 * np.dtype(allocate_type).itemsize
            assert shared_memory_inc % 8 == 0, "The bits of shared_memory is not aligned with 8-bit bytes."
            shared_memory_in_bytes += shared_memory_inc // 8 * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d"
                % (shared_memory_in_bytes, max_shared_memory_per_block))

    # Compile Source Code
    def build_template():
        return tvm.build(s, arg_bufs, tvm_target, name='template_op')

    func = wait_for(build_template, 30)
    assert len(func.imported_modules) == 1
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    print("====================================")
    print(device_source)
    print("====================================")
    print()
    try:
        eval_client = importlib.import_module('platforms.%s.evaluator.client' % backend)
    except ModuleNotFoundError:
        print('>> Evaluator for backend %s not found, skipping evaluation.' % backend)
        exit(0)
    except:
        traceback.print_exc()
        exit(1)

    def handle_result(result):
        print('\n[EvalAgent] Results =', json.dumps(result))
        if 'RESULT' in os.environ:
            if abs(float(os.environ['RESULT']) / result['K/0'] - 1.0) > 1e-6:
                result['TPR'] = None

        t = result.get('TPR', None)
        if t is None:
            print("\n[Antares] Incorrect compute kernel from evaluator.")
        else:
            gflops = compute_gflops(task.flop, t)
            print("\n[Antares] Average time cost / run = %g sec, %g gflops." % (t, gflops))
            with open(local_get_dir_file('result.txt'), 'w') as fp:
                fp.write(str(t) + '\n')
                if 'K/0' in result:
                    fp.write(str(result['K/0']) + '\n')
            if os.environ['OP'] == 'auto.generic' and os.environ.get('COMMIT', ''):
                kernel_path = codehub_db(
                    os.environ['COMPUTE_V1'],
                    source_code=device_source + '\n// Saved Perf = %g sec / run' % t)
                print(' >> Update current code to codehub: %s' % kernel_path)

    tune_slot_id = int(os.environ.get(unified_slot_key, '0'))
    exec_fd, _ = system_lock([tune_slot_id])
    try:
        expected_timeout = None
        if 'EXPECTED_TIMEOUT' in os.environ and not math.isinf(float(os.environ['EXPECTED_TIMEOUT'])):
            expected_timeout = float(os.environ['EXPECTED_TIMEOUT'])
            expected_timeout = max(expected_timeout * 1.1, expected_timeout + 0.1)

        results = eval_client.eval(
            kernel_path=local_get_dir_file('my_kernel.cc'),
            expected_timeout=expected_timeout,
            func=func,
        )
    except:
        traceback.print_exc()
        exit(1)

    handle_result(results)
    exec_fd()
    exit(0)
def create_auto_task(tvm_target):
    return auto_scheduler.create_task(auto_template, (), tvm_target)
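
# Illustrative usage (assumes `auto_template` was registered above with
# @auto_scheduler.register_workload, as in the earlier snippets):
auto_task = create_auto_task(tvm.target.Target("cuda"))
print(auto_task.compute_dag)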
    bias = te.placeholder((1, CO, 1, 1), name="bias")
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32")
    out = topi.nn.relu(conv + bias)
    return [data, kernel, bias, out]


######################################################################
# Create the search task
# ^^^^^^^^^^^^^^^^^^^^^^
# We then create a search task for the last convolution layer in ResNet.

target = tvm.target.Target("cuda")

# Use the last layer in ResNet-50
N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
task = auto_scheduler.create_task(conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding), target)

# Inspect the computational graph
print(task.compute_dag)

######################################################################
# Next, we set parameters for the auto-scheduler. These parameters
# mainly specify how we do the measurement during the search.
#
# * :code:`measure_ctx` launches a different process for measurement to
#   provide isolation. It can protect the master process from GPU crashes
#   during measurement and avoid other runtime conflicts.
# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement.
#   This can warm up the GPU, which is necessary to get accurate measurement results.
#   Typically, we recommend a value > 300 ms.
# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
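
######################################################################
# As a quick sketch (illustrative values; the log file name "conv2d.json"
# is an assumption, not fixed by this tutorial), these measurement options
# combine as follows:

measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,  # a small value for a fast demo; ~1000 in practice
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
)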
def generate_sketches(workload_func, args, target, print_for_debug=False):
    task = auto_scheduler.create_task(workload_func, args, tvm.target.Target(target))
    policy = auto_scheduler.SketchPolicy(task, verbose=0)
    return policy.generate_sketches(print_for_debug)
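
# Illustrative usage: enumerate the sketches for a 512x512x512 matmul workload.
sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm")
print("Number of sketches:", len(sketches))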