def test_papi(target, dev):
    target = tvm.target.Target(target)
    if str(target.kind) == "llvm":
        metric = "PAPI_FP_OPS"
    elif str(target.kind) == "cuda":
        metric = "cuda:::event:shared_load:device=0"
    else:
        pytest.skip(f"Target {target.kind} not supported by this test")

    mod, params = mlp.get_workload(1)

    exe = relay.vm.compile(mod, target, params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

    data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
    report = vm.profile(
        [data],
        func_name="main",
        collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: [metric]})],
    )
    assert metric in str(report)

    csv = read_csv(report)
    assert metric in csv.keys()
    assert any([float(x) > 0 for x in csv[metric]])
def test_vm(target, dev):
    dtype = "float32"
    x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype=dtype)
    y = relay.var("y", shape=(relay.Any(), relay.Any()), dtype=dtype)
    mod = tvm.IRModule()
    mod["main"] = relay.Function([x, y], relay.add(x, y))
    exe = relay.vm.compile(mod, target)
    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

    data = np.random.rand(28, 28).astype("float32")
    report = vm.profile(data, data, func_name="main")
    assert "fused_add" in str(report)
    assert "Total" in str(report)
    assert "AllocTensorReg" in str(report)
    assert "AllocStorage" in str(report)

    csv = read_csv(report)
    assert "Hash" in csv.keys()
    # Ops should have a duration greater than zero.
    assert all(
        [
            float(dur) > 0
            for dur, name in zip(csv["Duration (us)"], csv["Name"])
            if name[:5] == "fused"
        ]
    )
    # AllocTensor or AllocStorage may be cached, so their duration could be 0.
    assert all(
        [
            float(dur) >= 0
            for dur, name in zip(csv["Duration (us)"], csv["Name"])
            if name[:5] != "fused"
        ]
    )
def test_vm(target, dev): mod, params = mlp.get_workload(1) exe = relay.vm.compile(mod, target, params=params) vm = profiler_vm.VirtualMachineProfiler(exe, dev) data = np.random.rand(1, 1, 28, 28).astype("float32") report = vm.profile(data, func_name="main") assert "fused_nn_softmax" in report assert "Total time" in report
def test_basic(dev, target):
    mod, params = mlp.get_workload(batch_size=1)
    if not profiler_vm.enabled():
        return

    exe = relay.vm.compile(mod, target, params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    res = vm.profile(tvm.nd.array(data), func_name="main")
    assert "softmax" in str(res)
def test_report_serialization(): mod, params = mlp.get_workload(1) exe = relay.vm.compile(mod, "llvm", params=params) vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu()) data = np.random.rand(1, 1, 28, 28).astype("float32") report = vm.profile(data, func_name="main") report2 = Report.from_json(report.json()) # equality on reports compares pointers, so we compare the printed results instead. assert str(report) == str(report2)
def test_basic():
    mod, params = resnet.get_workload()
    if not profiler_vm.enabled():
        return

    for target, dev in enabled_targets():
        exe = relay.vm.compile(mod, target, params=params)
        vm = profiler_vm.VirtualMachineProfiler(exe, dev)

        data = np.random.rand(1, 3, 224, 224).astype("float32")
        res = vm.invoke("main", [data])

        print("\n{}".format(vm.get_stat()))
        print("\n{}".format(vm.get_stat(False)))
def test_vm_reshape_and_copy():
    target = "llvm"
    dev = tvm.cpu()  # the executable is compiled for llvm, so run it on the CPU
    x_np = np.random.uniform(size=(8, 16)).astype("float32")
    x = relay.var("x", shape=(8, 16), dtype="float32")
    y = relay.reshape(x, [-1, 4, 8])
    mod = tvm.IRModule()
    mod["main"] = relay.Function([x], y)
    with tvm.transform.PassContext(opt_level=3):
        exec = relay.vm.compile(mod, target)
    assert "reshape_tensor" in exec.bytecode
    vm = profiler_vm.VirtualMachineProfiler(exec, dev)
    vm.profile(tvm.nd.array(x_np))
def test_basic():
    mod, params = resnet.get_workload()
    target = "llvm"
    ctx = tvm.cpu()
    if not profiler_vm.enabled():
        return

    exe = relay.vm.compile(mod, target, params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, ctx)

    data = np.random.rand(1, 3, 224, 224).astype("float32")
    res = vm.invoke("main", [data])

    print("\n{}".format(vm.get_stat()))
    print("\n{}".format(vm.get_stat(False)))
def test_vm(target, dev): mod, params = mlp.get_workload(1) exe = relay.vm.compile(mod, target, params=params) vm = profiler_vm.VirtualMachineProfiler(exe, dev) data = np.random.rand(1, 1, 28, 28).astype("float32") report = vm.profile(data, func_name="main") assert "fused_nn_softmax" in str(report) assert "Total" in str(report) csv = read_csv(report) assert "Hash" in csv.keys() assert all([float(x) > 0 for x in csv["Duration (us)"]])
def test_rpc_vm():
    # Start a local RPC server and connect to it.
    server = rpc.Server(key="profiling")
    remote = rpc.connect("127.0.0.1", server.port, key="profiling")

    # Export the compiled module, upload it over RPC, and load it remotely.
    mod, params = mlp.get_workload(1)
    exe = relay.vm.compile(mod, "llvm", params=params)
    temp = utils.tempdir()
    path = temp.relpath("lib.tar")
    exe.mod.export_library(path)
    remote.upload(path)
    rexec = remote.load_module("lib.tar")

    vm = profiler_vm.VirtualMachineProfiler(rexec, remote.cpu())
    report = vm.profile(
        tvm.nd.array(np.ones((1, 1, 28, 28), dtype="float32"), device=remote.cpu())
    )
    assert len(report.calls) > 0
def test_vm(target, dev): mod, params = mlp.get_workload(1) exe = relay.vm.compile(mod, target, params=params) vm = profiler_vm.VirtualMachineProfiler(exe, dev) data = np.random.rand(1, 1, 28, 28).astype("float32") report = vm.profile(data, func_name="main") assert "fused_nn_softmax" in str(report) assert "Total" in str(report) f = StringIO(report.csv()) reader = csv.reader(f, delimiter=",") # force parsing for row in reader: pass
def test_json():
    mod, params = mlp.get_workload(1)
    exe = relay.vm.compile(mod, "llvm", params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())

    data = np.random.rand(1, 1, 28, 28).astype("float32")
    report = vm.profile(data, func_name="main")

    parsed = json.loads(report.json())
    assert "device_metrics" in parsed
    assert "calls" in parsed
    assert "Duration (us)" in parsed["calls"][0]
    assert "microseconds" in parsed["calls"][0]["Duration (us)"]
    assert len(parsed["calls"]) > 0
    for call in parsed["calls"]:
        assert isinstance(call["Name"]["string"], str)
        assert isinstance(call["Count"]["count"], int)
        assert isinstance(call["Duration (us)"]["microseconds"], float)
def test_vm(target, dev):
    dtype = "float32"
    x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype=dtype)
    y = relay.var("y", shape=(relay.Any(), relay.Any()), dtype=dtype)
    mod = tvm.IRModule()
    mod["main"] = relay.Function([x, y], relay.add(x, y))
    exe = relay.vm.compile(mod, target)
    vm = profiler_vm.VirtualMachineProfiler(exe, dev)

    data = np.random.rand(28, 28).astype("float32")
    report = vm.profile(data, data, func_name="main")
    assert "fused_add" in str(report)
    assert "Total" in str(report)
    assert "AllocTensorReg" in str(report)
    assert "AllocStorage" in str(report)

    csv = read_csv(report)
    assert "Hash" in csv.keys()
    assert all([float(x) > 0 for x in csv["Duration (us)"]])
def test_report_serialization(): mod, params = mlp.get_workload(1) exe = relay.vm.compile(mod, "llvm", params=params) vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu()) data = np.random.rand(1, 1, 28, 28).astype("float32") report = vm.profile(data, func_name="main") report2 = Report.from_json(report.json()) # Equality on reports compares pointers, so we compare the printed # results instead. # Use .table() instead of str(), because str() includes aggregate # and column summations whose values may be impacted by otherwise # negligible conversion errors. (2 occurrences / 3000 trials) assert report.table(aggregate=False, col_sums=False) == report2.table(aggregate=False, col_sums=False)
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    end_to_end: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random".
        Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug executor.
    end_to_end : bool
        Whether to measure the time of memory copies as well as model
        execution. Turning this on can provide a more realistic estimate
        of how long running the model in production would take.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(f"Model {tvmc_package.package_path} is not a MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run on micro devices ('--device micro')."
                )

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except Exception as exception:
                raise TVMCException(
                    "Could not open a session with the micro target."
                ) from exception
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must already be flashed into the micro target before one tries
        # to run it. Hence skip model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if tvmc_package.type == "vm":
            assert inputs is not None, "vm runner requires inputs to be provided as a dict"

            input_tensor = {}
            for e, i in inputs.items():
                input_tensor[e] = tvm.nd.array(i, dev)

            if profile:
                logger.debug("Creating vm with profile enabled.")
                exe = profiler_vm.VirtualMachineProfiler(lib, dev)
                res = exe.profile(**input_tensor, func_name="main")
                # This print is intentional
                print(res)
            else:
                exe = vm.VirtualMachine(lib, dev)

            exe_outputs = exe.invoke("main", **input_tensor)
            times = exe.benchmark(
                dev,
                **input_tensor,
                func_name="main",
                repeat=repeat,
                number=number,
                end_to_end=end_to_end,
            )

            # Special handling if the output only has a single value
            if not isinstance(exe_outputs, list):
                exe_outputs = [exe_outputs]

            outputs = {}
            for i, val in enumerate(exe_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = val.numpy()
        else:
            # TODO(gromero): Adjust for micro targets.
            if profile:
                logger.debug("Creating runtime with profiling enabled.")
                module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
            else:
                if device == "micro":
                    logger.debug("Creating runtime (micro) with profiling disabled.")
                    module = tvm.micro.create_local_graph_executor(tvmc_package.graph, lib, dev)
                else:
                    logger.debug("Creating runtime with profiling disabled.")
                    module = executor.create(tvmc_package.graph, lib, dev)

            logger.debug("Loading params into the runtime module.")
            module.load_params(tvmc_package.params)

            logger.debug("Collecting graph input shape and type:")
            shape_dict, dtype_dict = module.get_input_info()
            logger.debug("Graph input shape: %s", shape_dict)
            logger.debug("Graph input type: %s", dtype_dict)

            inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

            logger.debug("Setting inputs to the module.")
            module.set_input(**inputs_dict)

            # Run must be called explicitly if profiling
            if profile:
                logger.info("Running the module with profiling enabled.")
                report = module.profile()
                # This print is intentional
                print(report)

            if device == "micro":
                # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
                # fixed module.benchmark() can be used instead and this if/else can
                # be removed.
                module.run()
                times = []
            else:
                # Call the benchmarking function of the executor.
                # Optionally measure end-to-end data transfer overheads from the
                # CPU to device memory (e.g. PCIe overheads if the device is a
                # discrete GPU).
                if end_to_end:
                    dev = session.cpu()
                times = module.benchmark(dev, number=number, repeat=repeat, end_to_end=end_to_end)

            logger.debug("Collecting the output tensors.")
            num_outputs = module.get_num_outputs()
            outputs = {}
            for i in range(num_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
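
# Illustrative usage sketch (not executed as part of this module): how run_module
# might be invoked for a package produced earlier, e.g. by tvmc.compile as
# referenced in the error message above. `tvmc_package` is assumed to be an
# existing TVMCPackage; the input name "data" and its shape are hypothetical
# and depend on the compiled model.
#
#     result = run_module(
#         tvmc_package,
#         device="cpu",
#         inputs={"data": np.random.rand(1, 3, 224, 224).astype("float32")},
#         repeat=3,
#         number=5,
#         profile=True,  # also prints a per-operator profiling report
#     )
#     print(result.outputs["output_0"])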