def test_meta_schedule_local_runner_time_out():
    """Test meta schedule Local Runner time out

    ``MatmulModule`` is built for the ``llvm`` target and handed to a
    ``LocalRunner`` configured with ``timeout_sec=1``. The function registered
    under ``meta_schedule.runner.test_time_out`` (the name passed as
    ``f_alloc_argument``) sleeps for 2 seconds, so the run is expected to come
    back with a timeout error message and no measured timings.

    The build artifact is removed in a ``finally`` clause so that a failing
    assertion does not leave it behind.
    """

    def initializer():
        @register_func("meta_schedule.runner.test_time_out")
        def timeout_session_creator(  # pylint: disable=unused-variable
            device: Device,  # pylint: disable=unused-argument
            args_info: T_ARG_INFO_JSON_OBJ_LIST,  # pylint: disable=unused-argument
            alloc_repeat: int,  # pylint: disable=unused-argument
        ) -> RPCSession:
            # Outlast the runner's 1-second timeout configured below.
            # NOTE(review): annotated as returning RPCSession, but nothing is
            # returned; the function only stalls.
            time.sleep(2)

    mod = MatmulModule
    builder = LocalBuilder()
    (builder_result,) = builder.build([BuilderInput(mod, Target("llvm"))])
    try:
        assert builder_result.artifact_path is not None
        assert builder_result.error_msg is None

        runner_input = RunnerInput(
            builder_result.artifact_path,
            "llvm",
            [
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            ],
        )

        evaluator_config = EvaluatorConfig(
            number=1,
            repeat=1,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner = LocalRunner(
            timeout_sec=1,
            evaluator_config=evaluator_config,
            initializer=initializer,
            f_alloc_argument="meta_schedule.runner.test_time_out",
        )

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()

        assert runner_result.error_msg is not None and runner_result.error_msg.startswith(
            "LocalRunner: Timeout, killed after"
        )
        assert runner_result.run_secs is None
    finally:
        # Clean up even when an assertion above fails; skip when the build
        # produced no artifact at all.
        if builder_result.artifact_path is not None:
            _clean_build(builder_result.artifact_path)
def verify_meta_schedule_with_tensorrt(
    mod,
    params,
    data_shape,
    use_trt: bool = True,
):
    """Build ``mod`` for CUDA with a ``LocalBuilder`` and time it with a ``LocalRunner``.

    Parameters
    ----------
    mod :
        Module passed to ``BuilderInput``.
    params :
        Parameters passed to ``BuilderInput`` alongside ``mod``.
    data_shape :
        Shape of the single ``float32`` tensor described to the runner.
    use_trt : bool
        Selects ``build_relay_with_tensorrt`` as the build function when true,
        ``build_relay`` otherwise.

    The function asserts that both the build and the run finish without an
    error message and that every reported run time is a non-negative float.
    """
    # Build
    build_fn = build_relay_with_tensorrt if use_trt else build_relay
    builder = LocalBuilder(f_build=build_fn, timeout_sec=1000)
    build_outputs = builder.build([BuilderInput(mod, Target("cuda"), params)])
    builder_result = build_outputs[0]
    assert builder_result.error_msg is None, builder_result.error_msg
    assert builder_result.artifact_path is not None

    # Run
    timing_config = EvaluatorConfig(
        number=5,
        repeat=2,
        min_repeat_ms=0,
        enable_cpu_cache_flush=False,
    )
    runner = LocalRunner(
        evaluator_config=timing_config,
        f_run_evaluator=run_with_graph_executor,
    )
    runner_input = RunnerInput(
        builder_result.artifact_path,
        device_type="cuda",
        args_info=[TensorInfo("float32", data_shape)],
    )

    # Run the module
    futures = runner.run([runner_input])
    runner_result = futures[0].result()
    assert runner_result is not None
    assert runner_result.error_msg is None, runner_result.error_msg
    assert runner_result.run_secs is not None

    for measured in runner_result.run_secs:
        # Timings may arrive wrapped in FloatImm; unwrap before checking.
        seconds = measured.value if isinstance(measured, FloatImm) else measured
        assert isinstance(seconds, float)
        assert seconds >= 0.0
def test_meta_schedule_local_single_run():
    """Test meta schedule local runner for a single run

    Builds ``MatmulModule`` for the ``llvm`` target, runs it once through a
    ``LocalRunner`` and checks that no error is reported and that every
    measured run time is a non-negative float.

    The build artifact is removed in a ``finally`` clause so that a failing
    assertion does not leave it behind.
    """
    # Build the module
    mod = MatmulModule
    builder = LocalBuilder()
    (builder_result,) = builder.build([BuilderInput(mod, Target("llvm"))])
    try:
        assert builder_result.artifact_path is not None
        assert builder_result.error_msg is None

        runner_input = RunnerInput(
            builder_result.artifact_path,
            "llvm",
            [
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
                TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            ],
        )

        evaluator_config = EvaluatorConfig(
            number=1,
            repeat=1,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner = LocalRunner(timeout_sec=100, evaluator_config=evaluator_config)

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()

        assert runner_result.error_msg is None
        for result in runner_result.run_secs:
            # Timings may arrive wrapped in FloatImm; unwrap before checking.
            if isinstance(result, FloatImm):
                result = result.value
            assert isinstance(result, float)
            assert result >= 0.0
    finally:
        # Clean up even when an assertion above fails; skip when the build
        # produced no artifact at all.
        if builder_result.artifact_path is not None:
            _clean_build(builder_result.artifact_path)
def test_meta_schedule_evolutionary_search():  # pylint: disable=invalid-name
    """Drive ``EvolutionarySearch`` by hand and check the size of every batch.

    The strategy is asked for measure candidates until it returns ``None``.
    Each batch is answered with fixed, fabricated runner results, and the
    recorded batch sizes are compared against the expected split of
    ``num_trials_total`` into batches of 10.
    """
    num_trials_per_iter = 10
    num_trials_total = 100

    strategy = EvolutionarySearch(
        num_trials_per_iter=num_trials_per_iter,
        num_trials_total=num_trials_total,
        population_size=5,
        init_measured_ratio=0.1,
        init_min_unmeasured=50,
        genetic_num_iters=3,
        genetic_mutate_prob=0.5,
        genetic_max_fail_count=10,
        eps_greedy=0.9,
    )
    context = TuneContext(
        mod=Matmul,
        space_generator=ScheduleFn(sch_fn=_schedule_matmul),
        mutator_probs={DummyMutator(): 1.0},
        target=tvm.target.Target("llvm"),
        num_threads=1,  # because we are using a mutator from the python side
    )
    # Bound to a name so the scheduler stays referenced until the explicit
    # ``del`` at the end of the test.
    _scheduler = RoundRobin(
        tasks=[context],
        builder=LocalBuilder(),
        runner=LocalRunner(),
        database=DummyDatabase(),
        cost_model=RandomModel(),
        measure_callbacks=[],
    )

    space_generator = context.space_generator
    space_generator.initialize_with_tune_context(context)
    design_spaces = space_generator.generate_design_space(context.mod)

    strategy.initialize_with_tune_context(context)
    strategy.pre_tuning(design_spaces)

    reference_fn = ScheduleFn(sch_fn=_schedule_matmul)
    (correct_sch,) = reference_fn.generate_design_space(Matmul)

    drop_decisions = isinstance(strategy, ReplayTrace)
    num_trials_each_iter: List[int] = []
    while True:
        candidates = strategy.generate_measure_candidates()
        if candidates is None:
            break
        num_trials_each_iter.append(len(candidates))

        runner_results: List[RunnerResult] = []
        for candidate in candidates:
            # NOTE(review): the comparison result is discarded, so this call
            # does not act as an assertion - confirm that is intended.
            _is_trace_equal(
                candidate.sch,
                correct_sch,
                remove_decisions=drop_decisions,
            )
            runner_results.append(
                RunnerResult(run_secs=[0.11, 0.41, 0.54], error_msg=None)
            )
        strategy.notify_runner_results(context, candidates, runner_results)

    strategy.post_tuning()
    print(num_trials_each_iter)

    correct_count = 10  # For each iteration except the last one
    full_batches, remainder = divmod(num_trials_total, correct_count)
    expected_sizes = [correct_count] * full_batches
    if remainder != 0:
        # A final, smaller batch is expected only when the total does not
        # divide evenly.
        expected_sizes.append(remainder)
    assert num_trials_each_iter == expected_sizes

    del _scheduler
def test_meta_schedule_evolutionary_search():  # pylint: disable=invalid-name
    """Drive ``EvolutionarySearch`` by hand and check the size of every batch.

    Two test doubles are declared locally: a mutator that returns a copy of
    the trace with an empty decision map, and an in-memory database. The
    strategy is then asked for measure candidates until it returns ``None``.
    Each batch is answered with fixed, fabricated runner results, and the
    recorded batch sizes are compared against the expected split of
    ``num_trials_total`` into batches of 10.
    """

    @derived_object
    class DummyMutator(PyMutator):
        """Dummy Mutator for testing"""

        def initialize_with_tune_context(self, context: "TuneContext") -> None:
            pass

        def apply(self, trace: Trace, _) -> Optional[Trace]:
            # Same instructions, empty decision map.
            return Trace(trace.insts, {})

    @derived_object
    class DummyDatabase(PyDatabase):
        """Dummy Database for testing"""

        def __init__(self):
            super().__init__()
            self.records = []
            self.workload_reg = []

        def has_workload(self, mod: IRModule) -> bool:
            return any(
                tvm.ir.structural_equal(known.mod, mod) for known in self.workload_reg
            )

        def commit_tuning_record(self, record: TuningRecord) -> None:
            self.records.append(record)

        def commit_workload(self, mod: IRModule) -> Workload:
            # Reuse an already registered workload when one is structurally
            # equal to ``mod``; register a new one otherwise.
            for known in self.workload_reg:
                if tvm.ir.structural_equal(known.mod, mod):
                    return known
            fresh = Workload(mod)
            self.workload_reg.append(fresh)
            return fresh

        def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
            def _mean_run_secs(record):
                return sum(record.run_secs) / len(record.run_secs)

            # Rank every record by mean run time first, then keep the ones
            # belonging to ``workload``.
            ranked = sorted(self.records, key=_mean_run_secs)
            matching = [record for record in ranked if record.workload == workload]
            return matching[: int(top_k)]

        def __len__(self) -> int:
            return len(self.records)

        def print_results(self) -> None:
            print("\n".join(map(str, self.records)))

    num_trials_per_iter = 10
    num_trials_total = 100

    strategy = EvolutionarySearch(
        num_trials_per_iter=num_trials_per_iter,
        num_trials_total=num_trials_total,
        population_size=5,
        init_measured_ratio=0.1,
        init_min_unmeasured=50,
        genetic_num_iters=3,
        genetic_mutate_prob=0.5,
        genetic_max_fail_count=10,
        eps_greedy=0.9,
    )
    context = TuneContext(
        mod=Matmul,
        space_generator=ScheduleFn(sch_fn=_schedule_matmul),
        mutator_probs={DummyMutator(): 1.0},
        target=tvm.target.Target("llvm"),
        num_threads=1,  # because we are using a mutator from the python side
    )
    # Bound to a name so the scheduler stays referenced until the explicit
    # ``del`` at the end of the test.
    _scheduler = RoundRobin(
        tasks=[context],
        builder=LocalBuilder(),
        runner=LocalRunner(),
        database=DummyDatabase(),
        cost_model=RandomModel(),
        measure_callbacks=[],
    )

    space_generator = context.space_generator
    space_generator.initialize_with_tune_context(context)
    design_spaces = space_generator.generate_design_space(context.mod)

    strategy.initialize_with_tune_context(context)
    strategy.pre_tuning(design_spaces)

    reference_fn = ScheduleFn(sch_fn=_schedule_matmul)
    (correct_sch,) = reference_fn.generate_design_space(Matmul)

    drop_decisions = isinstance(strategy, ReplayTrace)
    num_trials_each_iter: List[int] = []
    while True:
        candidates = strategy.generate_measure_candidates()
        if candidates is None:
            break
        num_trials_each_iter.append(len(candidates))

        runner_results: List[RunnerResult] = []
        for candidate in candidates:
            # NOTE(review): the comparison result is discarded, so this call
            # does not act as an assertion - confirm that is intended.
            _is_trace_equal(
                candidate.sch,
                correct_sch,
                remove_decisions=drop_decisions,
            )
            runner_results.append(
                RunnerResult(run_secs=[0.11, 0.41, 0.54], error_msg=None)
            )
        strategy.notify_runner_results(context, candidates, runner_results)

    strategy.post_tuning()
    print(num_trials_each_iter)

    correct_count = 10  # For each iteration except the last one
    full_batches, remainder = divmod(num_trials_total, correct_count)
    expected_sizes = [correct_count] * full_batches
    if remainder != 0:
        # A final, smaller batch is expected only when the total does not
        # divide evenly.
        expected_sizes.append(remainder)
    assert num_trials_each_iter == expected_sizes

    del _scheduler
def test_meta_schedule_local_runner_add_test():
    """Test meta schedule local runner with add module

    ``AddModule`` is built for the ``llvm`` target and run through a
    ``LocalRunner`` with two custom hooks:

    * ``test_alloc_argument`` allocates the arguments with
      ``local_default_alloc_argument`` and stores a NumPy copy of each one in
      the global ``repeated_args_before``.
    * ``test_run_evaluator`` times the module, copies the arguments again
      afterwards and checks that the first two arguments are unchanged and
      that the third equals their element-wise sum.

    The build artifact is removed in a ``finally`` clause so that a failing
    assertion does not leave it behind.
    """

    def _check_correct_add(
        args_before: List[np.ndarray], args_after: List[np.ndarray]
    ) -> None:
        # The output buffer's pre-run content is irrelevant; the expected
        # output is derived from the two inputs instead.
        a_before, b_before, _ = args_before
        a_after, b_after, c_after = args_after
        expected_c = a_before + b_before
        assert (a_before == a_after).all()
        assert (b_before == b_after).all()
        assert (expected_c == c_after).all()

    def test_alloc_argument(
        device: Device,
        args_info: T_ARG_INFO_JSON_OBJ_LIST,
        alloc_repeat: int,
    ) -> List[T_ARGUMENT_LIST]:
        # A global is used to hand the pre-run snapshot over to
        # ``test_run_evaluator``, which reads and then deletes it.
        global repeated_args_before  # pylint: disable=global-variable-undefined, invalid-name
        repeated_args = local_default_alloc_argument(device, args_info, alloc_repeat)
        repeated_args_before = [[arg.asnumpy() for arg in args] for args in repeated_args]
        return repeated_args

    def test_run_evaluator(
        rt_mod: Module,
        device: Device,
        evaluator_config: EvaluatorConfig,
        repeated_args: List[Any],
    ) -> List[float]:
        global repeated_args_before  # pylint: disable=global-variable-undefined, invalid-name
        repeated_args_after = []
        evaluator = rt_mod.time_evaluator(
            func_name=rt_mod.entry_name,
            dev=device,
            number=evaluator_config.number,
            repeat=evaluator_config.repeat,
            min_repeat_ms=evaluator_config.min_repeat_ms,
            f_preproc="cache_flush_cpu_non_first_arg"
            if evaluator_config.enable_cpu_cache_flush
            else "",
        )
        repeated_costs: List[List[float]] = []
        for args in repeated_args:
            device.sync()
            profile_result = evaluator(*args)
            repeated_costs.append(profile_result.results)
            # Snapshot the arguments after the run for the correctness check.
            repeated_args_after.append([arg.asnumpy() for arg in args])
        costs = [float(cost) for cost in itertools.chain.from_iterable(repeated_costs)]
        for args_before, args_after in zip(repeated_args_before, repeated_args_after):
            _check_correct_add(args_before, args_after)
        # Drop the snapshot once it has been checked.
        del repeated_args_before
        return costs

    # Build the module
    mod = AddModule
    builder = LocalBuilder()
    (builder_result,) = builder.build([BuilderInput(mod, Target("llvm"))])
    try:
        assert builder_result.artifact_path is not None
        assert builder_result.error_msg is None

        runner_input = RunnerInput(
            builder_result.artifact_path,
            "llvm",
            [
                TensorInfo("float32", [MATMUL_M]),
                TensorInfo("float32", [MATMUL_M]),
                TensorInfo("float32", [MATMUL_M]),
            ],
        )

        evaluator_config = EvaluatorConfig(
            number=1,
            repeat=1,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner = LocalRunner(
            timeout_sec=100,
            evaluator_config=evaluator_config,
            f_alloc_argument=test_alloc_argument,
            f_run_evaluator=test_run_evaluator,
        )

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()

        assert runner_result.error_msg is None
        for result in runner_result.run_secs:
            # Timings may arrive wrapped in FloatImm; unwrap before checking.
            if isinstance(result, FloatImm):
                result = result.value
            assert isinstance(result, float)
            assert result >= 0.0
    finally:
        # Clean up even when an assertion above fails; skip when the build
        # produced no artifact at all.
        if builder_result.artifact_path is not None:
            _clean_build(builder_result.artifact_path)
def test_meta_schedule_local_multiple_runs():
    """Test meta schedule local runner for multiple runs

    Builds ``MatmulModule``, ``MatmulReluModule`` and ``BatchMatmulModule``
    for the ``llvm`` target in one builder call, runs all three through a
    single ``LocalRunner`` and checks that every run reports no error and only
    non-negative float timings.

    Every artifact that was produced is removed in a ``finally`` clause, so a
    failing assertion (including a failed sibling build) does not leave
    artifacts behind.
    """
    # Build the module
    mods = [
        MatmulModule,
        MatmulReluModule,
        BatchMatmulModule,
    ]
    # One argument description per module, in the same order as ``mods``.
    args_infos = [
        [
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
        ],
        [
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
        ],
        [
            TensorInfo("float32", [16, MATMUL_M, MATMUL_M]),
            TensorInfo("float32", [16, MATMUL_M, MATMUL_M]),
            TensorInfo("float32", [16, MATMUL_M, MATMUL_M]),
        ],
    ]

    builder = LocalBuilder()
    builder_inputs = [BuilderInput(mod, Target("llvm")) for mod in mods]
    builder_results = builder.build(builder_inputs)
    try:
        for builder_result in builder_results:
            assert builder_result.artifact_path is not None
            assert builder_result.error_msg is None

        # ``zip`` would silently truncate on a length mismatch, so make the
        # one-result-per-module expectation explicit first.
        assert len(builder_results) == len(mods)
        runner_inputs = [
            RunnerInput(builder_result.artifact_path, "llvm", args_info)
            for builder_result, args_info in zip(builder_results, args_infos)
        ]

        evaluator_config = EvaluatorConfig(
            number=1,
            repeat=1,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner = LocalRunner(timeout_sec=100, evaluator_config=evaluator_config)

        # Run the module
        runner_futures = runner.run(runner_inputs)
        runner_results = [runner_future.result() for runner_future in runner_futures]

        for runner_result in runner_results:
            assert runner_result.error_msg is None
            for result in runner_result.run_secs:
                # Timings may arrive wrapped in FloatImm; unwrap before checking.
                if isinstance(result, FloatImm):
                    result = result.value
                assert isinstance(result, float)
                assert result >= 0.0
    finally:
        # Clean up every artifact that exists, even when an assertion above
        # fails part-way through.
        for builder_result in builder_results:
            if builder_result.artifact_path is not None:
                _clean_build(builder_result.artifact_path)
def verify_meta_schedule_with_tensorrt(
    mod, params, data_shape, use_meta_sched: bool = True, use_trt: bool = True, mode: str = "vm"
):
    """Build and run ``mod`` on CUDA, with or without meta schedule and TensorRT.

    Parameters
    ----------
    mod :
        Module to build.
    params :
        Parameters forwarded to the build step (or to ``create_executor`` in
        the non-meta-schedule, non-TensorRT path).
    data_shape :
        Shape of the single ``float32`` tensor described to the runner.
    use_meta_sched : bool
        When true, build with ``LocalBuilder`` and time with ``LocalRunner``,
        asserting that the run succeeds with non-negative float timings.
        When false, only create a Relay executor for ``mod``.
    use_trt : bool
        Selects the TensorRT build function / TensorRT partitioning.
    mode : str
        Executor kind passed to ``relay.create_executor``; only used when
        ``use_meta_sched`` is false.
    """
    if use_meta_sched:
        # With meta_schedule
        dev = "cuda"

        # Build
        if use_trt:
            from tvm.meta_schedule.testing import relay_build_with_tensorrt

            f_build = relay_build_with_tensorrt
        else:

            def relay_build_without_tensorrt(
                mod: Module,
                target: Target,
                params: dict,
            ) -> List[BuilderResult]:
                # NOTE(review): ``target`` is ignored; "cuda" and "llvm" are
                # passed as literals instead.
                return tvm.relay.build_module._build_module_no_factory(
                    mod, "cuda", "llvm", params
                )

            f_build = relay_build_without_tensorrt

        builder = LocalBuilder(f_build=f_build)
        build_request = BuilderInput(mod, Target(dev, host="llvm"), params)
        (builder_result,) = builder.build([build_request])
        assert builder_result.error_msg is None
        assert builder_result.artifact_path is not None

        # Run
        evaluator_config = EvaluatorConfig(
            number=5,
            repeat=2,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )
        runner_input = RunnerInput(
            builder_result.artifact_path, "cuda", [TensorInfo("float32", data_shape)]
        )

        def eval_func(rt_mod, device, evaluator_config, repeated_args):
            # Wrap the loaded module in a graph executor and time its "run"
            # function once per argument set.
            graph_mod = tvm.contrib.graph_executor.GraphModule(rt_mod["default"](device))
            if evaluator_config.enable_cpu_cache_flush:
                preproc = "cache_flush_cpu_non_first_arg"
            else:
                preproc = ""
            evaluator = graph_mod.module.time_evaluator(
                func_name="run",
                dev=device,
                number=evaluator_config.number,
                repeat=evaluator_config.repeat,
                min_repeat_ms=evaluator_config.min_repeat_ms,
                f_preproc=preproc,
            )
            repeated_costs: List[List[float]] = []
            for args in repeated_args:
                repeated_costs.append(evaluator(*args).results)
            # Flatten the per-argument-set timings into one list of floats.
            return [float(cost) for results in repeated_costs for cost in results]

        runner = LocalRunner(
            evaluator_config=evaluator_config,
            f_run_evaluator=eval_func,
        )

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()
        assert runner_result is not None
        assert runner_result.run_secs is not None
        assert runner_result.error_msg is None

        for measured in runner_result.run_secs:
            # Timings may arrive wrapped in FloatImm; unwrap before checking.
            seconds = measured.value if isinstance(measured, FloatImm) else measured
            assert isinstance(seconds, float)
            assert seconds >= 0.0
    else:
        # Without meta_schedule
        if use_trt:
            mod, config = tensorrt.partition_for_tensorrt(mod)
            pass_ctx = tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}
            )
            extra_kwargs = {}
        else:
            pass_ctx = tvm.transform.PassContext(opt_level=3)
            # Only the non-TensorRT path hands ``params`` to the executor.
            extra_kwargs = {"params": params}

        with pass_ctx:
            executor = relay.create_executor(
                mode, mod=mod, device=tvm.cuda(0), target="cuda", **extra_kwargs
            )
            # NOTE(review): ``func`` is not used by the code visible here.
            func = executor.evaluate()  # pylint: disable=unused-variable