def test_extern_opt():
    def Optimize(mod):
        return relay.transform.FoldConstant()(mod)

    # Register the optimization pass used by the "test_target" external codegen.
    tvm.register_func("relay.ext.test_target.optimize", Optimize)

    x = relay.var("x", shape=(2, 2))
    y0 = relay.var("y0", shape=(2, 2))
    y1 = relay.var("y1", shape=(2, 2))
    yy0 = relay.annotation.compiler_begin(y0, "test_target")
    yy1 = relay.annotation.compiler_begin(y1, "test_target")
    z = yy0 + yy1
    end = relay.annotation.compiler_end(z, "test_target")
    f = relay.Function([x, y0, y1], end * x)

    # Bind constant inputs so the partitioned external region becomes foldable.
    c = np.ones(shape=(2, 2), dtype="float32")
    f = bind_params_by_name(f, {"y0": tvm.nd.array(c), "y1": tvm.nd.array(c)})

    mod = tvm.IRModule()
    mod["main"] = f
    mod = transform.InferType()(mod)
    mod = transform.PartitionGraph()(mod)

    try:
        t0 = mod["test_target_0"]
    except Exception:
        raise KeyError("test_target_0 not found in the partitioned module")
    assert isinstance(t0.body, relay.Constant)

    expected = np.empty([2, 2])
    expected.fill(2)
    tvm.testing.assert_allclose(t0.body.data.asnumpy(), expected, rtol=1e-5, atol=1e-5)
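
# The test above relies on names imported elsewhere in its module. A minimal
# sketch of a standalone driver, assuming the import layout of TVM's Relay
# BYOC test suite (the layout is an assumption, not part of the snippet):
import numpy as np
import tvm
import tvm.testing
from tvm import relay
from tvm.relay import transform
from tvm.relay.build_module import bind_params_by_name

if __name__ == "__main__":
    test_extern_opt()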
def mk_primitive_op(self, func: Expr, args, output_type) -> Expr:
    # Key the compile-engine cache by (function, target); name the lowered
    # PackedFunc after the structural hash so identical functions share it.
    cc_key = compile_engine.CCacheKey(func, self.tgt)
    func_hash = tvm.ir.structural_hash(func)  # avoid shadowing the builtin `hash`
    name = f"op_{func_hash}"
    if not get_global_func(name, allow_missing=True):
        jit_func = self.engine.jit(cc_key, self.tgt)
        register_func(name, jit_func)
    return PackedCall(name, args, [x.checked_type for x in args], output_type)
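
# The method above JIT-compiles each structurally distinct primitive function
# at most once and reuses it through TVM's global function registry. The same
# memoization idiom, decoupled from the compile engine (dict-backed cache; all
# names here are illustrative, not any real API):
_jit_cache = {}

def get_or_jit(key, jit_fn):
    """Compile at most once per key; later lookups reuse the cached artifact."""
    if key not in _jit_cache:
        _jit_cache[key] = jit_fn()
    return _jit_cache[key]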
def main_compute(code_only=False):
    tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module(
        'templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    if verbose:
        print(' >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key,
                    'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index, "time": "", "code_hash": code_hash, "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert jobj[i][1] in ['sp', 'ot', 're']
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" % (dump_to_file, json_space))
            with open(dump_to_file, "w") as fp:
                fp.write(json_space)
            sys.exit(0)
        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise ValueError("NNI returned no parameter.")
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except Exception:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except Exception:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except Exception:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', '')
        if not tuner_type:
            comp = os.environ['COMPUTE_V1']
            if '=!' in comp and 'plan/' not in comp[comp.find(' ##') + 1:] \
                    and ';' not in comp and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']:
                tuner_type = 'AutoTVM2'
            else:
                tuner_type = 'XGBoost'
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception("Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:
            AntaresGlobal.current_step = 0

            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(thread_pool.submit(
                        run_config_entity, config_to_json(inputs[i].config),
                        AntaresGlobal.current_step + i + 1, expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = AntaresGlobal.current_step + i + 1
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(autotvm.measure.MeasureResult(
                        costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
                    AntaresGlobal.current_step, num_trials,
                    json.dumps(config_to_json(tuner.task.best.config)),
                    compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                    compute_mem_ratio(tuner.task.best.timecost),
                    tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, best_slot))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')
            if history_log_for_transfer_learning:
                callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(tuner.task.best.timecost), "No valid config found in the whole tuning."

            best_config = json.dumps(config_to_json(tuner.task.best.config))

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

            print("\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
                best_config,
                compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                tuner.task.best.occur, num_trials, tuner.task.best.timecost))

            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print(" >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = ''

    assert isinstance(best_config, str)
    if verbose:
        print("====>> [Current Config Option]", best_config)

    if best_config.startswith('['):
        # A JSON-list config is an auto-scheduler (Ansor) record: wrap it into a
        # record file and replay its schedule steps.
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [['["main_compute.<locals>.auto_template"]',
                   'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                       device_properties().max_threads_per_block,
                       device_properties().warp_size)],
                  origin_cfg],
            "r": [[0], 0, 0, 0],
            "v": "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
            break
    else:
        config = json_to_config(json.loads(best_config)) if best_config else task.config_space
        with ApplyConfig(config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    device_source, kernel_path = get_target_source(s, arg_bufs)
    if code_only:
        return device_source

    if verbose:
        print("====================================")
        print(device_source)
        print("====================================\n")

    dev_id = int(os.environ.get('DEV_KEY', '0'))
    result = evaluate_perf(kernel_path, task.flop, dev_id)
    exit(0 if result is not None else 1)
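
# The two nested helpers above translate between AutoTVM's ConfigEntity and a
# plain dict: list values starting with -1 are split ('sp') knobs, other lists
# are reorder ('re') knobs, scalars are 'ot'. A round-trip sketch — only
# runnable if the helpers are hoisted to module scope, and the dict below is a
# made-up example, not a real schedule:
example = {"tile_x": [-1, 4, 8], "unroll": 1}
cfg = json_to_config(example)           # dict -> autotvm ConfigEntity
assert config_to_json(cfg) == example   # ConfigEntity -> dict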
def create_op_call(self, op: Function, relay_args, py_args):
    """Lowers the passed primitive function, registers it in TVM's global
    compiler, and produces a call to the lowered function in the generated
    Python code."""
    # compile the function and register globally
    cc_key = compile_engine.CCacheKey(op, self.tgt)
    func_hash = tvm.ir.structural_hash(op)
    op_name = "_lowered_op_{}".format(func_hash)
    if not tvm.get_global_func(op_name, allow_missing=True):
        jitted = self.engine.jit(cc_key, self.tgt)
        tvm.register_func(op_name, jitted)

    def convert_input(py_input, arg_type):
        """Use the types of the function arguments to determine whether we
        expect a tensor or tuple (returns list of inputs to the lowered op
        call)"""
        # equivalent: input.data
        if isinstance(arg_type, relay.TensorType):
            return [py_input]
        assert isinstance(arg_type, relay.TupleType)
        # convert each input.fields[i]
        ret = []
        for i in range(len(arg_type.fields)):
            ret += convert_input(
                ast.Subscript(py_input, ast.Index(Num(i)), Load()),
                arg_type.fields[i])
        return ret

    def convert_output(ret_type):
        """Use the function return type to produce auxiliary variables to
        store outputs. Returns ([assignments of output vars], [extra arguments
        to pass to op call], expression collecting output)"""
        if isinstance(ret_type, relay.TensorType):
            output_var_name = self.generate_var_name("_out")
            output_var = Name(output_var_name, Load())
            shape = ast.Tuple([Num(dim) for dim in ret_type.concrete_shape], Load())
            # create a new NDArray of the right shape and dtype
            assign_output = Assign(
                [Name(output_var_name, Store())],
                self.create_call("nd.array", [
                    self.create_call("numpy.empty", [shape, Str(ret_type.dtype)])
                ]),
            )
            return ([assign_output], [output_var], output_var)
        assert isinstance(ret_type, relay.TupleType)
        assignments = []
        extra_args = []
        fields = []
        for t in ret_type.fields:
            inner_assignments, inner_args, inner_output = convert_output(t)
            assignments += inner_assignments
            extra_args += inner_args
            fields.append(inner_output)
        fields = [ast.List(fields, Load())]
        return (assignments, extra_args,
                self.create_call("_container.tuple_object", fields))

    # create a function to wrap the call of the lowered op and return
    # a call to that function
    wrap_name = self.generate_function_name("_{}_wrapper".format(op_name))
    wrap_args = [self.generate_var_name("_arg_{}".format(i)) for i in range(len(py_args))]

    inner_call_args = []
    for i in range(len(py_args)):
        inner_call_args += convert_input(Name(wrap_args[i], Load()),
                                         relay_args[i].checked_type)
    output_assignments, aux_args, output = convert_output(op.checked_type.ret_type)

    # equiv: _op = tvm.get_global_func(op_name)
    op_var = self.generate_var_name("_op")
    op_call = self.create_call("tvm.get_global_func", [Str(op_name)])
    op_assign = Assign([Name(op_var, Store())], op_call)

    # equiv: _op(args)
    inner_call = self.create_call(op_var, inner_call_args + aux_args)
    body = output_assignments + [op_assign, ast.Expr(inner_call), Return(output)]
    wrap_def = self.create_def(wrap_name, wrap_args, body)
    return wrap_def, self.create_call(wrap_name, py_args)
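
# For orientation: with two tensor arguments and one (2, 2) float32 tensor
# result, the AST assembled above corresponds roughly to the generated source
# below. This is a sketch of emitted code, not standalone-runnable; the real
# names come from generate_var_name/generate_function_name, and `nd`/`numpy`
# are imported by the converter's module preamble:
def __lowered_op_123_wrapper(_arg_0, _arg_1):
    _out = nd.array(numpy.empty((2, 2), "float32"))  # convert_output: allocate result buffer
    _op = tvm.get_global_func("_lowered_op_123")     # fetch the registered PackedFunc
    _op(_arg_0, _arg_1, _out)                        # destination-passing call: outputs come last
    return _out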
def main_compute(code_only=False):
    tvm_target = 'cuda'
    tvm.register_func('tvm_callback_cuda_compile', compile_source, override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module(
        'templates.' + (os.environ['OP'] if 'OP' in os.environ else 'auto.generic'))
    print(' >> Backend = %s, Python PID = %s, Task = %s;' %
          (backend, os.getpid(), default_tune_op.__name__))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key,
                    'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index, "time": "", "code_hash": code_hash, "entity": json_dict
        })
        # config = ConfigEntity.from_json_dict({"i": index, "t": "", "c": code_hash, "e": json_dict})
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        # jobj = config.to_json_dict()['e']
        json_dict = dict()
        for i in range(len(jobj)):
            assert jobj[i][1] in ['sp', 'ot', 're']
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        if config[0] != '[':
            params_given = json.loads(config)
            print("====>> [Current Config Option]", config)
            best_config = json_to_config(params_given)
        else:
            best_config = config
    elif 'NNI_TRIAL_JOB_ID' in os.environ:
        if os.environ['NNI_TRIAL_JOB_ID'] == '@':
            search_space = get_search_space(task.config_space)
            json_space = json.dumps(search_space)
            dump_to_file = './search_space.json'
            print("\n>> Writing Search Space to '%s', Search Space = %s;" % (dump_to_file, json_space))
            with open(dump_to_file, "w") as fp:
                fp.write(json_space)
            sys.exit(0)
        try:
            import nni
            params_given = nni.get_next_parameter()
            if params_given is None:
                raise ValueError("NNI returned no parameter.")
            local_dir_id = os.environ['NNI_TRIAL_JOB_ID']
        except Exception:
            params_given = default_tune_op.get_choice_example()
            local_dir_id = '_'
        t = run_config_entity(params_given, local_dir_id)
        gflops = compute_gflops(task.flop, t)
        print('[Antares-engine] Final entity result is: %g' % gflops)
        try:
            nni.report_final_result(gflops)
        except Exception:
            print('[Antares-engine] (not reporting final result to NNI.)')
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except Exception:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        task.antares_helper = Mock()
        task.antares_helper.json_to_config = json_to_config
        task.antares_helper.config_to_json = config_to_json
        task.antares_helper.to_json_search_space = get_search_space

        tuner_type = os.environ.get('TUNER', 'XGBoost')
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception("Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if tuner is not None:
            def measure_batch(inputs):
                results, futures = [], []
                best_slot = -1
                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    futures.append(thread_pool.submit(
                        run_config_entity, config_to_json(inputs[i].config),
                        i, expected_timecost, i % dev_num))
                for i in range(len(inputs)):
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = i
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = tuner.task.best.curr_step + i + 1
                    results.append(autotvm.measure.MeasureResult(
                        costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
                tuner.task.best.curr_step += len(results)

                print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, Occur Step = %d;' % (
                    tuner.task.best.curr_step, num_trials,
                    json.dumps(config_to_json(tuner.task.best.config)),
                    compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                    tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source +
                        '\n// Saved Perf = %g sec / run; Step Produced = %d;' % (t, tuner.task.best.curr_step))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1
            tuner.task.best.curr_step = 0

            tuner.measure_batch = measure_batch
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')
            if history_log_for_transfer_learning:
                callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials,
                       measure_option=autotvm.measure_option(
                           builder=autotvm.LocalBuilder(n_parallel=batch_size),
                           runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)),
                       callbacks=callbacks)
            assert not math.isinf(tuner.task.best.timecost), "No valid config found in the whole tuning."

            best_config = tuner.task.best.config
            print("\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
                json.dumps(config_to_json(best_config)),
                compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                tuner.task.best.occur, num_trials, tuner.task.best.timecost))
            if hasattr(tuner, 'cleanup'):
                tuner.cleanup()
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        if os.environ['OP'] == 'auto.generic':
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None:
                print(" >> Using Saved Code from Codehub:")
                print("===========================")
                print(saved_code)
                print("===========================")
                exit(0)
        best_config = task.config_space

    if isinstance(best_config, str):
        # A JSON-list config string is an auto-scheduler (Ansor) record: wrap it
        # into a record file and replay its schedule steps.
        from tvm import auto_scheduler
        origin_cfg = json.loads(best_config)
        origin_cfg = {
            "i": [['["main_compute.<locals>.auto_template"]',
                   'cuda -keys=cuda,gpu -max_num_threads=%d -thread_warp_size=%d' % (
                       device_properties().max_threads_per_block,
                       device_properties().warp_size)],
                  origin_cfg],
            "r": [[0], 0, 0, 0],
            "v": "v0.2",
        }
        origin_cfg_file = local_get_dir_file('my_kernel.cfg')
        with open(origin_cfg_file, 'w') as fp:
            fp.write(json.dumps(origin_cfg))
        origin_cfg = tvm.auto_scheduler.measure_record.load_records(origin_cfg_file)

        @auto_scheduler.register_workload
        def auto_template():
            _, arg_bufs = default_tune_op.get_template_op()
            return arg_bufs

        target = tvm.target.Target("cuda")
        auto_task = auto_scheduler.create_task(auto_template, (), target)
        for inp, res in origin_cfg:
            s, arg_bufs = auto_task.compute_dag.apply_steps_from_state(inp.state)
            break
    else:
        with ApplyConfig(best_config):
            with tvm.target.Target(tvm_target):
                s, arg_bufs = default_tune_op.get_template_op()

    if s is not None:
        lower_source = str(tvm.lower(s, arg_bufs, simple_mode=True))
        lower_file = local_get_dir_file('my_kernel.lower')
        with open(lower_file, 'w') as fp:
            fp.write(lower_source)

        # Verify the lowered code contains exactly one kernel
        if len(('\n' + lower_source).split('\nprimfn(')) != 2:
            raise Exception('[Not Support Multi Unfuse-able kernels]\n\n' + lower_source)

        max_threads_per_block = device_properties().max_threads_per_block
        max_shared_memory_per_block = device_properties().max_shared_memory_per_block
        assert max_threads_per_block > 0 and max_shared_memory_per_block >= 0, \
            '[Error] Invalid device properties, maybe device is not detected correctly.'

        # Scrape thread extents and shared-memory allocations from the lowered IR text
        lower_lines = lower_source.split('\n')
        thread_extents, allocate_shared = [], []
        for ll in lower_lines:
            if ll.strip().startswith('attr [IterVar(') and ll.find(' "thread_extent" = ') >= 0:
                thread_name = ll.split('attr [IterVar(')[-1].split(':')[0]
                thread_val = int(ll.split(' "thread_extent" = ')[-1].split(';')[0].strip().split(' ')[0])
                thread_extents.append((thread_name, thread_val))
            elif ll.strip().startswith('allocate(') and ll.find('.shared, ') >= 0 and ll.endswith(");"):
                parts = ll[:-2].split(', ')[1:]
                allocate_type = parts[0]
                allocate_val = int(np.prod(eval(parts[1])))
                allocate_shared.append((allocate_type, allocate_val))

        reserved_axes = dict()
        for thread_name, thread_val in thread_extents:
            if thread_name in reserved_axes:
                assert reserved_axes[thread_name] == thread_val, \
                    "Invalid code: Multiple hints for thread extent conflict with each other: %d v.s. %d" % (
                        reserved_axes[thread_name], thread_val)
            else:
                reserved_axes[thread_name] = thread_val

        num_threads = 1
        for thread_name in ['threadIdx.x', 'threadIdx.y', 'threadIdx.z']:
            num_threads *= reserved_axes.get(thread_name, 1)
        assert num_threads <= max_threads_per_block, \
            "Invalid kernel code: using num_threads(%d) > max_threads_per_block(%d)" % (
                num_threads, max_threads_per_block)

        shared_memory_in_bytes = 0
        for allocate_type, allocate_size in allocate_shared:
            if allocate_type.startswith('custom['):
                type_name = allocate_type[7:].split(']')[0]
                shared_memory_inc = int(custom_dtypes[type_name][-1].split('@')[-1])
            else:
                shared_memory_inc = 8 * np.dtype(allocate_type).itemsize
            assert shared_memory_inc % 8 == 0, "The bit width of the shared memory type is not byte-aligned."
            shared_memory_in_bytes += shared_memory_inc // 8 * allocate_size

        if shared_memory_in_bytes > max_shared_memory_per_block:
            raise Exception(
                "Invalid kernel code: using shared_memory_in_bytes %d > max_shared_memory_per_block %d" % (
                    shared_memory_in_bytes, max_shared_memory_per_block))

    # Compile Source Code
    def build_template():
        return tvm.build(s, arg_bufs, tvm_target, name='template_op')

    func = wait_for(build_template, 30)
    assert len(func.imported_modules) == 1
    device_source = translate_code(func.imported_modules[0].get_source())

    if code_only:
        return device_source

    print("====================================")
    print(device_source)
    print("====================================")
    print()

    try:
        eval_client = importlib.import_module('platforms.%s.evaluator.client' % backend)
    except ModuleNotFoundError:
        print('>> Evaluator for backend %s not found, skipping evaluation.' % backend)
        exit(0)
    except Exception:
        traceback.print_exc()
        exit(1)

    def handle_result(result):
        print('\n[EvalAgent] Results =', json.dumps(result))
        if 'RESULT' in os.environ:
            if abs(float(os.environ['RESULT']) / result['K/0'] - 1.0) > 1e-6:
                result['TPR'] = None
        t = result.get('TPR', None)
        if t is None:
            print("\n[Antares] Incorrect compute kernel from evaluator.")
        else:
            gflops = compute_gflops(task.flop, t)
            print("\n[Antares] Average time cost / run = %g sec, %g gflops." % (t, gflops))
            with open(local_get_dir_file('result.txt'), 'w') as fp:
                fp.write(str(t) + '\n')
                if 'K/0' in result:
                    fp.write(str(result['K/0']) + '\n')
            if os.environ['OP'] == 'auto.generic' and os.environ.get('COMMIT', ''):
                kernel_path = codehub_db(os.environ['COMPUTE_V1'],
                                         source_code=device_source + '\n// Saved Perf = %g sec / run' % t)
                print(' >> Update current code to codehub: %s' % kernel_path)

    tune_slot_id = int(os.environ.get(unified_slot_key, '0'))
    exec_fd, _ = system_lock([tune_slot_id])
    try:
        expected_timeout = None
        if 'EXPECTED_TIMEOUT' in os.environ and not math.isinf(float(os.environ['EXPECTED_TIMEOUT'])):
            expected_timeout = float(os.environ['EXPECTED_TIMEOUT'])
            expected_timeout = max(expected_timeout * 1.1, expected_timeout + 0.1)
        results = eval_client.eval(
            kernel_path=local_get_dir_file('my_kernel.cc'),
            expected_timeout=expected_timeout,
            func=func,
        )
    except Exception:
        traceback.print_exc()
        exit(1)
    handle_result(results)
    exec_fd()
    exit(0)
def server_init_callback():
    # pylint: disable=redefined-outer-name, reimported, import-outside-toplevel, import-self
    import tvm
    import vta.exec.rpc_server
    # Replace the default RPC server-start hook with VTA's implementation.
    tvm.register_func("tvm.rpc.server.start", vta.exec.rpc_server.server_start, override=True)
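
# Usage sketch: recent TVM versions let the RPC server take an init callback
# that runs inside the freshly spawned server process, which is where the
# override above belongs (host/port values are placeholders):
from tvm import rpc

server = rpc.Server(host="0.0.0.0", port=9091,
                    server_init_callback=server_init_callback)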
def main_compute(code_only=False):
    def compile_callback(code):
        # Return an empty binary so TVM skips real CUDA compilation here;
        # kernels are compiled and evaluated by the external evaluator.
        return bytearray()

    tvm.register_func('tvm_callback_cuda_compile', compile_callback, override=True)

    default_tune_op = importlib.import_module('lang.generic')

    import logging
    from tvm import autotvm
    logging.getLogger('autotvm').setLevel(logging.ERROR)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    AntaresGlobal.default_tune_op = default_tune_op
    AntaresGlobal.default_task = task

    if verbose:
        print(' >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif num_trials > 0:
        dev_num = backend_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        worker_size = batch_size if batch_size < dev_num else dev_num
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        tuner_type = os.environ.get('TUNER')
        if not tuner_type:
            explicit_ops = AntaresGlobal.attrs.explicit_ops
            tuner_type = 'OpEvo'
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s\n' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception("Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            task.search_space_v2 = AntaresGlobal.attrs.auto_config.get_config_space()
            task.n_parallel = batch_size
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if hasattr(tuner, 'cleanup'):
            AntaresGlobal.cleanup_funcs.append(tuner.cleanup)

        if tuner is not None:
            AntaresGlobal.current_step = 0
            eval_client.init(backend_root=backend_root)

            def measure_batch(inputs):
                results, futures = [], []
                target_sources, config_strs = [], []
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    config_str = inputs[i].config if type(inputs[i].config).__name__ == 'str' else 'null'
                    config_strs.append(config_str)
                    try:
                        target_source = get_target_source(config_strs[i], dir_sid)
                    except Exception:
                        # traceback.print_exc()
                        target_source = None
                    target_sources.append(target_source)

                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    futures.append(thread_pool.submit(
                        run_config_entity, target_sources[i], config_strs[i],
                        dir_sid, expected_timecost, i % dev_num))

                best_slot = -1
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = dir_sid
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(autotvm.measure.MeasureResult(
                        costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g sec / op (%g Gflops), MemRatio = %g %%, Occur Step = %d;\n' % (
                    AntaresGlobal.current_step, num_trials,
                    tuner.task.best.config,
                    tuner.task.best.timecost,
                    compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                    compute_mem_ratio(tuner.task.best.timecost),
                    tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source + code_suffix(tpr=t, step_prod=best_slot, step_plan=num_trials))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            # Seed the time-to-beat with a known baseline, if one is given.
            tuner.task.best.timecost = float(os.environ.get('EXPECTED_TIMEOUT', 'inf'))
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')
            if history_log_for_transfer_learning:
                callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials, callbacks=callbacks, measure_option=None)

            if math.isinf(tuner.task.best.timecost):
                print(f'[Error] No valid config found in the whole tuning. (Try other tuner types other than `TUNER={tuner_type}`?)')
                cleanup_on_exit(0, None)

            best_config = tuner.task.best.config

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

            print("\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
                best_config,
                compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                tuner.task.best.occur, num_trials, tuner.task.best.timecost))
            cleanup_on_exit(-1, None)
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        saved_code = codehub_db(os.environ['COMPUTE_V1'])
        if saved_code is not None:
            print(" >> Using Saved Code from Codehub:")
            print("// ---------------------------------------------------------------------------")
            print(saved_code)
            print("// ---------------------------------------------------------------------------")
            exit(0)
        best_config = ''

    assert isinstance(best_config, str)
    best_config = best_config if best_config else 'null'

    device_source, kernel_path = get_target_source(best_config)
    if code_only:
        return device_source

    if verbose:
        print()
        print("// ---------------------------------------------------------------------------")
        print(device_source)
        print("// ---------------------------------------------------------------------------")

    eval_client.init(backend_root=backend_root)
    dev_id = int(os.environ.get('DEV_ID', '0'))
    result = evaluate_perf(kernel_path, dev_id, device_source)
    exit(0 if result is not None and len(result) > 1 else 1)
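
# Invocation sketch for the entry point above: with CONFIG set and no STEP,
# the function skips tuning and returns the generated kernel source. The
# environment keys mirror those read above; 'null' selects the default
# schedule, per the fallback `best_config = best_config if best_config else 'null'`:
import os
os.environ['CONFIG'] = 'null'
os.environ.pop('STEP', None)
print(main_compute(code_only=True))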
def main_compute(code_only=False):
    def compile_callback(code):
        # Return an empty binary so TVM skips real CUDA compilation here;
        # kernels are compiled and evaluated out of process.
        return bytearray()

    tvm.register_func('tvm_callback_cuda_compile', compile_callback, override=True)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    default_tune_op = importlib.import_module('lang.generic')
    task = autotvm.task.create("template_op", args=(), target=tvm_target)

    def json_to_config(json_dict, index=-1, code_hash=None):
        if not isinstance(json_dict, list):
            json_list = []
            for key in json_dict:
                json_list.append([
                    key,
                    'ot' if type(json_dict[key]) is not list else
                    ('sp' if json_dict[key][0:1] == [-1] else 're'),
                    json_dict[key]
                ])
            json_dict = json_list
        config = ConfigEntity.from_json_dict({
            "index": index, "time": "", "code_hash": code_hash, "entity": json_dict
        })
        return config

    def config_to_json(config):
        if config is None:
            return {}
        if isinstance(config, str):
            return json.loads(config)
        jobj = config.to_json_dict()['entity']
        json_dict = dict()
        for i in range(len(jobj)):
            assert jobj[i][1] in ['sp', 'ot', 're']
            json_dict[jobj[i][0]] = jobj[i][2]
        return json_dict

    task.antares_helper = Mock()
    task.antares_helper.json_to_config = json_to_config
    task.antares_helper.config_to_json = config_to_json
    task.antares_helper.to_json_search_space = get_search_space

    AntaresGlobal.default_tune_op = default_tune_op
    AntaresGlobal.default_task = task

    if verbose:
        print(' >> Backend = %s, Python PID = %s, Task = %s;' %
              (backend, os.getpid(), default_tune_op.__name__))

    num_trials = int(os.environ['STEP']) if 'STEP' in os.environ else 0

    config = os.environ.get('CONFIG', '').strip()
    if config != '':
        best_config = config
    elif os.environ.get('NNI_TRIAL_JOB_ID', '') == '@':
        search_space = get_search_space(task.config_space)
        json_space = json.dumps(search_space)
        print("\n>> Search Space: %s" % (json_space))
        exit(0)
    elif num_trials > 0:
        dev_num = platform_config.get_execution_parallism()
        if dev_num <= 0:
            raise Exception("No valid device found for backend: %s." % backend)
        batch_size = int(os.environ.get('BATCH', '16'))

        from concurrent.futures import ThreadPoolExecutor
        try:
            if platform_config.allow_concurrent_compile_execution():
                raise Exception()
            worker_size = 1
        except Exception:
            worker_size = batch_size
        thread_pool = ThreadPoolExecutor(max_workers=worker_size)

        tuner_type = os.environ.get('TUNER', '')
        if not tuner_type:
            explicit_ops = AntaresGlobal.attrs.explicit_ops
            global_outs = get_global_arg_props()['_out']
            if ('|plan/' not in ('|' + '|'.join(AntaresGlobal.attrs.options))
                    and len(explicit_ops) == 1
                    and len(explicit_ops[-1].reduce_axis) > 0
                    and len(global_outs) == 1
                    and global_outs[0]['name'] == explicit_ops[-1].name
                    and backend in ['c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl']):
                tuner_type = 'Ansor'
            else:
                tuner_type = 'XGBoost'
        print(' >> MAKE_PARA = %d/%d, EXEC_PARA = %d, TUNER = %s' %
              (worker_size, batch_size, dev_num, tuner_type))

        auto_commit = os.environ.get('COMMIT', '')
        if auto_commit:
            saved_code = codehub_db(os.environ['COMPUTE_V1'])
            if saved_code is not None and auto_commit != 'force':
                raise Exception("Saved code already exists in codehub. Please try COMMIT=force to override it.")
            os.environ.pop('COMMIT')

        try:
            task.n_parallel = batch_size
            tuner = importlib.import_module('tuner.%s.main' % tuner_type)
            tuner = tuner.MainTuner(task)
        except Exception:
            raise Exception('>> Cannot import Antares Tuner: %s' % tuner_type)

        if hasattr(tuner, 'cleanup'):
            AntaresGlobal.cleanup_funcs.append(tuner.cleanup)

        if tuner is not None:
            AntaresGlobal.current_step = 0

            def measure_batch(inputs):
                results, futures = [], []
                target_sources, config_strs = [], []
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    config_strs.append(json.dumps(config_to_json(inputs[i].config)))
                    try:
                        target_source = get_target_source(config_strs[i], dir_sid)
                    except Exception:
                        # traceback.print_exc()
                        target_source = None
                    target_sources.append(target_source)

                expected_timecost = tuner.task.best.timecost
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    futures.append(thread_pool.submit(
                        run_config_entity, target_sources[i], config_strs[i],
                        dir_sid, expected_timecost, i % dev_num))

                best_slot = -1
                for i in range(len(inputs)):
                    dir_sid = AntaresGlobal.current_step + i + 1
                    t = futures[i].result()
                    if t < tuner.task.best.timecost:
                        best_slot = dir_sid
                        tuner.task.best.timecost = t
                        tuner.task.best.config = inputs[i].config
                        tuner.task.best.occur = best_slot
                    results.append(autotvm.measure.MeasureResult(
                        costs=(t,), error_no=0, all_cost=i, timestamp=time.time()))
                AntaresGlobal.current_step += len(results)

                print('\nSTEP[%d / %d] Current Best Config = %s, Perf = %g Gflops, MemRatio = %g %%, Occur Step = %d;' % (
                    AntaresGlobal.current_step, num_trials,
                    json.dumps(config_to_json(tuner.task.best.config)),
                    compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                    compute_mem_ratio(tuner.task.best.timecost),
                    tuner.task.best.occur))

                if auto_commit and best_slot >= 0:
                    with open(local_get_dir_file('my_kernel.cc', best_slot), 'r') as fp:
                        device_source = fp.read()
                    with open(local_get_dir_file('result.txt', best_slot), 'r') as fp:
                        t = float(fp.read().split()[0])
                    kernel_path = codehub_db(
                        os.environ['COMPUTE_V1'],
                        source_code=device_source + code_suffix(tpr=t, step_prod=best_slot, step_plan=num_trials))
                    print(' >> Update current code to codehub: %s' % kernel_path)
                return results

            tuner.task.best = Mock()
            tuner.task.best.timecost = float('inf')
            tuner.task.best.config = None
            tuner.task.best.occur = -1

            tuner.measure_batch = measure_batch
            tuner.measure_batch.n_parallel = batch_size
            callbacks = []

            history_log_for_transfer_learning = os.environ.get('RECORD', '')
            if history_log_for_transfer_learning:
                callbacks.append(autotvm.callback.log_to_file(history_log_for_transfer_learning))
                # Enable Transfer Learning for Incremental Task
                if os.path.exists(history_log_for_transfer_learning):
                    print(' >> Loading incremental history from log file: %s ..' %
                          history_log_for_transfer_learning)
                    tuner.load_history(autotvm.record.load_from_file(history_log_for_transfer_learning))

            tuner.tune(n_trial=num_trials, callbacks=callbacks, measure_option=None)

            if math.isinf(tuner.task.best.timecost):
                print(f'[Error] No valid config found in the whole tuning. (Try other tuner types other than `TUNER={tuner_type}`?)')
                cleanup_on_exit(0, None)

            best_config = json.dumps(config_to_json(tuner.task.best.config))

            if auto_commit:
                device_source = codehub_db(os.environ['COMPUTE_V1'])
                codehub_db(os.environ['COMPUTE_V1'],
                           source_code=device_source +
                           '\n// Antares Tuning Completed in %d steps.' % AntaresGlobal.current_step)

            print("\n[Best Config] CONFIG='%s' ==> Performance is up to %f Gflops, occurred at step %d / %d; time per run = %g sec." % (
                best_config,
                compute_gflops(tuner.task.flop, tuner.task.best.timecost),
                tuner.task.best.occur, num_trials, tuner.task.best.timecost))
            cleanup_on_exit(-1, None)
        else:
            raise Exception('Unrecognized tuner type: `%s`' % tuner_type)
        exit(0)
    else:
        saved_code = codehub_db(os.environ['COMPUTE_V1'])
        if saved_code is not None:
            print(" >> Using Saved Code from Codehub:")
            print("===========================")
            print(saved_code)
            print("===========================")
            exit(0)
        best_config = ''

    assert isinstance(best_config, str)
    best_config = best_config if best_config else task.config_space

    device_source, kernel_path, compile_args = get_target_source(best_config)
    if code_only:
        return device_source

    if verbose:
        print("====================================")
        print(device_source)
        print("====================================\n")

    do_compilation(compile_args)
    dev_id = int(os.environ.get('DEV_KEY', '0'))
    result = evaluate_perf(kernel_path, dev_id, device_source)
    exit(0 if result is not None and len(result) > 1 else 1)
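
# The TUNER auto-selection in the function above, restated as a standalone
# predicate for readability (a sketch: AntaresGlobal and get_global_arg_props
# are assumed to come from the surrounding module):
def pick_tuner(attrs, backend):
    explicit_ops = attrs.explicit_ops
    global_outs = get_global_arg_props()['_out']
    no_manual_plan = '|plan/' not in ('|' + '|'.join(attrs.options))
    single_reduce_output = (len(explicit_ops) == 1
                            and len(explicit_ops[-1].reduce_axis) > 0
                            and len(global_outs) == 1
                            and global_outs[0]['name'] == explicit_ops[-1].name)
    if no_manual_plan and single_reduce_output and backend in ('c-rocm', 'c-cuda', 'c-hlsl', 'c-ocl'):
        return 'Ansor'    # a single reduction kernel suits the auto-scheduler
    return 'XGBoost'      # otherwise fall back to template-based search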