def test_dense_dense():
    M, N, K = 128, 128, 128
    data_shape = (M, K)
    weight_shape = (N, K)
    relay_mod = tvm.IRModule.from_expr(get_dense_dense(data_shape, weight_shape))

    data_np = np.random.randn(*data_shape).astype("float32")
    weight1_np = np.random.randn(*weight_shape).astype("float32")
    weight2_np = np.random.randn(*weight_shape).astype("float32")

    target = "llvm"
    params = {"weight1": weight1_np, "weight2": weight2_np}

    def schedule_fn(task, sch):
        # Apply the manual schedule only to the fused dense + dense task.
        if "nn_dense_nn_dense" in task.task_name:
            schedule_dense_dense(sch)
            return True
        return False

    database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)

    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    dev = tvm.device(target, 0)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    runtime.set_input("data", data_np)
    runtime.run()

    out = runtime.get_output(0).numpy()
    ref = get_ref(data_np, weight1_np, weight2_np)

    tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
def tune_each_task(
    mod,
    target,
    config,
    runner,
    work_dir,
    params,
):
    extracted_tasks = ms.extract_task_from_relay(mod, target, params)
    database = ms.database.JSONDatabase(
        path_workload=os.path.join(work_dir, "default_database_workload.json"),
        path_tuning_record=os.path.join(work_dir, "default_database_tuning_record.json"),
    )
    # Tune each extracted task with its own task scheduler instead of tuning them jointly.
    for task in extracted_tasks:
        # pylint: disable=protected-access
        tune_context = ms.tune.Parse._tune_context(
            tune_context=None,
            mod=ms.tune.Parse._mod(task.dispatched[0]),
            target=target,
            config=config,
            task_name=task.task_name,
            space_generator=None,
            sch_rules=None,
            postprocs=None,
            mutator_probs=None,
            num_threads=os.cpu_count(),
        )
        task_scheduler = ms.tune.Parse._task_scheduler(
            None,
            [tune_context],
            task_weights=[1.0],
            builder=ms.tune.Parse._builder(None),
            runner=ms.tune.Parse._runner(runner),
            database=database,
            max_trials=config.max_trials_per_task,
            cost_model=ms.tune.Parse._cost_model(None),
            measure_callbacks=ms.tune.Parse._callbacks(None),
        )
        # pylint: enable=protected-access
        task_scheduler.tune()
    with target, ms.ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            return relay_build(mod, target=target, params=params)
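# A hypothetical usage sketch (not part of the original tests): one way tune_each_task
# above could be driven end to end. The resnet_18 workload, the TuneConfig values, and
# the LocalRunner are illustrative assumptions, not settings taken from this file.
def _example_tune_each_task_usage():  # pragma: no cover - illustration only
    mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
    config = ms.TuneConfig(
        strategy="replay_trace",
        num_trials_per_iter=32,
        max_trials_per_task=32,
        max_trials_global=20000,
    )
    with tempfile.TemporaryDirectory() as work_dir:
        return tune_each_task(
            mod,
            target=Target("llvm --num-cores=16"),
            config=config,
            runner=ms.runner.LocalRunner(),
            work_dir=work_dir,
            params=params,
        )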
def test_meta_schedule_integration_apply_history_best():
    mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
    database = ms.database.MemoryDatabase()
    env = ms.ApplyHistoryBest(database)
    target = Target("llvm")
    workload = database.commit_workload(MockModule)
    database.commit_tuning_record(
        ms.database.TuningRecord(
            trace=Schedule(MockModule).trace,
            workload=workload,
            run_secs=[1.0],
            target=target,
            args_info=[],
        )
    )
    mod = env.query(
        task_name="mock-task",
        mod=mod,
        target=target,
        dispatched=[MockModule],
    )
    assert tvm.ir.structural_equal(mod, workload.mod)
def manual_tir_common(do_tune=False):
    M, N, K = 1024, 1024, 1024  # pylint: disable=invalid-name
    data_shape = (M, K)
    weight_shape = (N, K)

    data_dtype = "uint8"
    data = relay.var("data", shape=data_shape, dtype=data_dtype)
    weight = relay.var("weight", shape=weight_shape, dtype="int8")
    bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32")

    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
    dense = relay.nn.dense(data, weight, out_dtype="int32")
    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
    out = relay.nn.batch_matmul(
        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
        out_dtype="int32",
    )

    relay_mod = tvm.IRModule.from_expr(out)

    target = "llvm -mcpu=cascadelake -num-cores 4"
    dev = tvm.device(target, 0)

    data = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=target)
        .evaluate()(*[data, weight_np, bias_np])
        .numpy()
    )

    params = {"weight": weight_np, "bias": bias_np}

    if do_tune:
        extracted_tasks = ms.extract_task_from_relay(relay_mod, target, params)
        # Filter out tasks that we don't intend to schedule / tune with TIR.
        tune_tasks = list(
            filter(
                lambda task: "dense" in task.task_name,
                extracted_tasks,
            )
        )
        config = ms.TuneConfig(
            strategy="replay_trace",
            num_trials_per_iter=64,
            max_trials_per_task=20000,
            max_trials_global=20000,
        )
        with tempfile.TemporaryDirectory() as work_dir:
            # postprocs=lambda: [] is important to prevent default post processors from
            # tampering with the manual schedule.
            database = ms.tune_extracted_tasks(
                tune_tasks,
                config,
                work_dir=work_dir,
                postprocs=lambda: [],
            )
    else:

        def schedule_fn(task, sch):
            if "dense" not in task.task_name:
                return False

            block = sch.get_block("compute")

            # Looks up schedule_rule annotation.
            # See the comment in test_tune_relay_manual_tir_vnni().
            schedule_rule = sch.get(block).annotations["schedule_rule"]
            assert "dense_vnni" in schedule_rule

            schedule_dense(block, M, False, sch)
            return True

        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)

    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            # pylint: disable=W0105
            """
            The log should say
            Warning: Cannot find workload: tvmgen_default_fused_expand_dims
            Warning: Cannot find workload: tvmgen_default_fused_cast
            Warning: Cannot find workload: tvmgen_default_fused_cast_1
            Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul

            This means batch matmul and others are scheduled by TE, and dense (the one
            not warned) is found in the meta schedule tuning database during ApplyHistoryBest.
            """
            # pylint: enable=W0105
            lib = relay.build(relay_mod, target=target, params=params)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
def test_meta_schedule_relay_lowering():
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    with tempfile.TemporaryDirectory() as work_dir:
        database = ms.database.JSONDatabase(
            osp.join(work_dir, "workload.json"),
            osp.join(work_dir, "records.json"),
        )
        # Commit a dummy tuning record (empty trace) for the conv2d workload so that
        # ApplyHistoryBest can find it during lowering.
        database.commit_tuning_record(
            ms.database.TuningRecord(
                Trace([], {}),
                database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc),
                [0.0],
                target=target,
                args_info=[],
            )
        )
        with ms.ApplyHistoryBest(database):
            with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
            ):
                rt_mod1 = relay.build(mod, target=target, params=params)

        # Compile without meta-schedule for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def test_meta_schedule_te2primfunc_argument_order():
    @ms.derived_object
    class TestDummyDatabase(ms.database.PyDatabase):
        def __init__(self):
            super().__init__()
            self.records = []
            self.workload_reg = []

        def has_workload(self, mod: IRModule) -> bool:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return True
            # The database has already put in all correct workloads
            raise ValueError(
                "The workload searched for is not in given database!"
                + " Incorrect TIR was generated from TE subgraph."
            )

        def commit_tuning_record(self, record: ms.database.TuningRecord) -> None:
            self.records.append(record)

        def commit_workload(self, mod: IRModule) -> ms.database.Workload:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return workload
            workload = ms.database.Workload(mod)
            self.workload_reg.append(workload)
            return workload

        def get_top_k(
            self,
            workload: ms.database.Workload,
            top_k: int,
        ) -> List[ms.database.TuningRecord]:
            return list(
                filter(
                    lambda x: x.workload == workload,
                    sorted(self.records, key=lambda x: sum(x.run_secs) / len(x.run_secs)),
                )
            )[: int(top_k)]

        def __len__(self) -> int:
            return len(self.records)

        def print_results(self) -> None:
            print("\n".join([str(r) for r in self.records]))

    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    database = TestDummyDatabase()
    database.commit_workload(tvmgen_default_fused_layout_transform)
    database.commit_workload(tvmgen_default_fused_layout_transform_1)
    database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)

    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            rt_mod1 = relay.build(mod, target=target, params=params)

    # Compile without meta-schedule for correctness check
    with tvm.transform.PassContext(opt_level=0):
        rt_mod2 = relay.build(mod, target=target, params=params)

    def get_output(data, lib):
        module = graph_executor.GraphModule(lib["default"](dev))
        module.set_input(input_name, data)
        module.run()
        return module.get_output(0).numpy()

    # Check correctness
    actual_output = get_output(data, rt_mod1)
    expected_output = get_output(data, rt_mod2)
    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)