if env.TARGET != "sim":
    # Get remote from fleet node
    remote = autotvm.measure.request_remote(
        env.TARGET, tracker_host, int(tracker_port), timeout=10000)
    # Reconfigure the JIT runtime and FPGA.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))
else:
    # In simulation mode, host the RPC server locally.
    remote = rpc.LocalSession()

# Get execution context from remote
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

####################################
# Build the inference graph runtime.
# ----------------------------------
# Using the Darknet library, load the downloaded vision model and compile it
# with Relay. The compilation steps are:
#
# 1. Front end translation from Darknet into a Relay module.
# 2. Apply 8-bit quantization: here we skip the first conv layer
#    and the dense layer, which will both be executed in fp32 on the CPU.
# 3. Perform graph packing to alter the data layout for tensorization.
# 4. Perform constant folding to reduce the number of operators
#    (e.g. eliminate batch norm multiply).
# (Steps 2 and 3 are sketched below.)
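
# ---------------------------------------------------------------------
# NOTE: a minimal sketch of steps 2 and 3 above, assuming a Relay module
# `mod` with `params`, a VTA environment `env`, a `target`, and the
# `start_pack`/`stop_pack` boundary names from context; the full build
# flow appears at the end of this section.
# ---------------------------------------------------------------------
from tvm import relay
from vta.top import graph_pack

# Step 2: 8-bit quantization, keeping the first conv layer in fp32.
with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
    relay_prog = relay.quantize.quantize(mod["main"], params=params)

# Step 3: repack the data layout so VTA's tensor intrinsics can be applied.
if target.device_name == "vta":
    assert env.BLOCK_IN == env.BLOCK_OUT
    relay_prog = graph_pack(
        relay_prog,
        env.BATCH,
        env.BLOCK_OUT,
        env.WGT_WIDTH,
        start_name=start_pack,
        stop_name=stop_pack,
    )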

def test_rpc_remote_module():
    # graph
    n = tvm.runtime.convert(102)
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)

    server0 = rpc.Server(key="x0")
    server1 = rpc.Server(key="x1")
    client = rpc.connect(
        "127.0.0.1",
        server0.port,
        key="x0",
        session_constructor_args=["rpc.Connect", "127.0.0.1", server1.port, "x1", False],
    )

    def check_remote(remote):
        temp = utils.tempdir()
        dev = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print("%g secs/op" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

        # Download the file from the remote
        path_tar = temp.relpath("dev_lib.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        local_download_path = temp.relpath("dev_lib.download.so")
        with open(local_download_path, "wb") as fo:
            fo.write(remote.download_linked_module("dev_lib.tar"))
        fupdated = tvm.runtime.load_module(local_download_path)
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), tvm.cpu(0))
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), tvm.cpu(0))
        fupdated(a, b)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    def check_minrpc():
        if tvm.get_global_func("rpc.CreatePipeClient", allow_missing=True) is None:
            return
        # export to minrpc
        temp = utils.tempdir()
        runtime = Runtime("cpp", {"system-lib": True})
        f = tvm.build(s, [A, B], "llvm", name="myadd", runtime=runtime)
        path_minrpc = temp.relpath("dev_lib.minrpc")
        f.export_library(path_minrpc, rpc.with_minrpc(cc.create_executable))

        with pytest.raises(RuntimeError):
            rpc.PopenSession("filenotexist")

        # start the minrpc session.
        remote = tvm.rpc.PopenSession(path_minrpc)
        dev = remote.cpu(0)
        f1 = remote.system_lib()

        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev)
        time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1)
        cost = time_f(a, b).mean
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

        # change to not executable
        os.chmod(path_minrpc, stat.S_IRUSR)
        with pytest.raises(RuntimeError):
            rpc.PopenSession(path_minrpc)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue in the TVM
        runtime when the server launches after the OpenCL runtime
        initializes. We leave it as an example of how to do RPC when we
        want to do linking on the remote.
        """
        if not tvm.testing.device_enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = utils.tempdir()
        dev = remote.cl(0)
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl --host=llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev)
        fhost(a, b)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
        # Option 2: export the library as a tarball and let the remote compiler handle it
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev)
        fhost(a, b)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    check_remote(rpc.LocalSession())
    check_remote(client)
    check_minrpc()

def main():
    # one line to get the model
    block = get_model('resnet18_v1', pretrained=True)

    # test model
    img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
    img_name = 'cat.png'
    img_path = download_testdata(img_url, img_name, module='data')
    image = Image.open(img_path).resize((224, 224))
    # tvm specific data path
    # print(img_path)
    x = transform_image(image)

    # label number to word dict prepared with synset
    synset_url = ''.join([
        'https://gist.githubusercontent.com/zhreshold/',
        '4d0b62f3d01426887599d4f7ede23ee5/raw/',
        '596b27d23537e5a1b5751d2b0481ef172f58b539/',
        'imagenet1000_clsid_to_human.txt'
    ])
    synset_name = 'imagenet1000_clsid_to_human.txt'
    synset_path = download_testdata(synset_url, synset_name, module='data')
    with open(synset_path) as f:
        synset = eval(f.read())
    # print(synset)

    # Port the Gluon model to a portable computational graph
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size,) + image_shape

    shape_dict = {'data': x.shape}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    # we want a probability, so add a softmax operator
    func = mod["main"]
    func = relay.Function(func.params, relay.nn.softmax(func.body), None,
                          func.type_params, func.attrs)

    # compile the graph to run on a Raspberry Pi 3B
    local_demo = False
    if local_demo:
        target = tvm.target.create('llvm')
    else:
        target = tvm.target.arm_cpu('rasp3b')
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)

    # Save the library to a local temporary directory.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # The RPC server is running on the Raspberry Pi.
    # Get the IP address of the Raspberry Pi and connect to it to run the
    # net compiled here with Relay.
    # obtain an RPC session from the remote device.
    if local_demo:
        remote = rpc.LocalSession()
    else:
        # The following is my environment; change this to the IP address
        # of your target device.
        host = '192.168.0.10'
        port = 9090
        remote = rpc.connect(host, port)

    # upload the library to the remote device and load it
    remote.upload(lib_fname)
    rlib = remote.load_module('net.tar')

    # create the remote runtime module
    ctx = remote.cpu(0)
    module = runtime.create(graph, rlib, ctx)
    # set parameters (upload params to the remote device; this may take a while)
    module.set_input(**params)
    # set input data
    module.set_input('data', tvm.nd.array(x.astype('float32')))
    # run
    module.run()
    # get output
    out = module.get_output(0)
    # get top1 result
    top1 = np.argmax(out.asnumpy())
    print('TVM prediction top-1: {}'.format(synset[top1]))
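
# ---------------------------------------------------------------------
# NOTE: `transform_image` is used above but not defined in this snippet.
# A plausible definition matching the standard MXNet ImageNet
# preprocessing; the mean/std values are assumptions if your model was
# trained with a different pipeline.
# ---------------------------------------------------------------------
import numpy as np


def transform_image(image):
    # HWC uint8 -> mean/std-normalized float -> NCHW with a batch axis.
    image = np.array(image) - np.array([123.0, 117.0, 104.0])
    image /= np.array([58.395, 57.12, 57.375])
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :]
    return image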

def tune_and_evaluate(tuning_opt):
    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on the Relay program
    print("Extract tasks...")
    relay_prog, params = compile_model()
    tasks = autotvm.task.extract_from_program(func=relay_prog,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d,),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        print("\t{}".format(tsk))

    # Tuning takes a long time; uncomment the following `return` to skip it.
    # return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to milliseconds
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
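
# ---------------------------------------------------------------------
# NOTE: `tune_tasks` is called above but not defined in this snippet.
# A condensed sketch following the standard AutoTVM tutorial pattern;
# the tuner choice and trial counts below are assumptions.
# ---------------------------------------------------------------------
import os

from tvm import autotvm
from tvm.autotvm.tuner import RandomTuner, XGBTuner


def tune_tasks(tasks, measure_option, tuner="xgb", n_trial=1000,
               early_stopping=None, log_filename="tuning.log"):
    # Tune each extracted task, logging all trials to a temporary file.
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        if tuner == "xgb":
            tuner_obj = XGBTuner(tsk, loss_type="rank")
        else:
            tuner_obj = RandomTuner(tsk)
        tuner_obj.tune(
            n_trial=min(n_trial, len(tsk.config_space)),
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(tmp_log_file),
            ],
        )

    # Keep only the best record per workload in the final log file.
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)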

def test_rpc_remote_module():
    if not tvm.runtime.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port)
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n,), name='A')
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = te.create_schedule(B.op)

    def check_remote(remote):
        if not tvm.runtime.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue in the TVM
        runtime when the server launches after the OpenCL runtime
        initializes. We leave it as an example of how to do RPC when we
        want to do linking on the remote.
        """
        if not tvm.runtime.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.runtime.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export the library as a tarball and let the remote compiler handle it
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(client)
    check_remote(rpc.LocalSession())
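
# ---------------------------------------------------------------------
# NOTE: both tests above start the RPC server in-process. Against a
# physical device, the server usually runs standalone; a minimal sketch,
# with placeholder IP address and port:
#
#   # on the target device (shell):
#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port 9090
# ---------------------------------------------------------------------
from tvm import rpc

remote = rpc.connect("192.168.0.10", 9090)  # placeholder IP/port
ctx = remote.cpu(0)  # same session API as rpc.LocalSession()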

def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will
        be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not
        provided, inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to
        "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.
    options : dict, optional
        Additional options passed to the micro project when running on
        a micro target.

    Returns
    -------
    TVMCResult
        The output tensors (a dict keyed by output name) and the
        execution times reported by the time evaluator.
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it.")

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(
                    f"Model {tvmc_package.package_path} is not an MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run micro devices ('--device micro').")

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except Exception as exc:
                raise TVMCException(
                    "Could not open a session with the micro target.") from exc
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must already be flashed onto the micro target before one tries to
        # run it. Hence skip the model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        # TODO(gromero): Adjust for micro targets.
        if profile:
            logger.debug("Creating runtime with profiling enabled.")
            module = debug_executor.create(tvmc_package.graph, lib, dev,
                                           dump_root="./prof")
        else:
            if device == "micro":
                logger.debug("Creating runtime (micro) with profiling disabled.")
                module = tvm.micro.create_local_graph_executor(
                    tvmc_package.graph, lib, dev)
            else:
                logger.debug("Creating runtime with profiling disabled.")
                module = runtime.create(tvmc_package.graph, lib, dev)

        logger.debug("Loading params into the runtime module.")
        module.load_params(tvmc_package.params)

        logger.debug("Collecting graph input shape and type:")
        shape_dict, dtype_dict = module.get_input_info()
        logger.debug("Graph input shape: %s", shape_dict)
        logger.debug("Graph input type: %s", dtype_dict)

        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

        logger.debug("Setting inputs to the module.")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.info("Running the module with profiling enabled.")
            report = module.profile()
            # This print is intentional
            print(report)

        if device == "micro":
            # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
            # fixed, module.benchmark() can be used instead and this if/else
            # can be removed.
            module.run()
            times = []
        else:
            # call the benchmarking function of the executor
            times = module.benchmark(dev, number=number, repeat=repeat)

        logger.debug("Collecting the output tensors.")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
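
# ---------------------------------------------------------------------
# NOTE: hypothetical usage sketch. The package path is a placeholder, and
# the TVMCPackage import path and constructor follow tvmc as of roughly
# TVM 0.8; both are assumptions, not part of the snippet above.
# ---------------------------------------------------------------------
from tvm.driver.tvmc.model import TVMCPackage

package = TVMCPackage(package_path="resnet50-v2-7-tvm.tar")  # placeholder file
result = run_module(package, device="cpu", fill_mode="random", repeat=3, number=5)
for name, tensor in result.outputs.items():
    print(name, tensor.shape, tensor.dtype)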

def tune_and_evaluate(tuning_opt):
    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, tracker_port, timeout=10000
        )
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on the Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(
        mod,
        params=params,
        ops=(relay.op.get("nn.conv2d"),),
        target=target,
        target_host=env.target_host,
    )

    # filter out non-packed conv2d tasks
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print(
            "({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
                batch,
                height,
                width,
                in_filter,
                out_filter,
                hkernel,
                wkernel,
                hpad,
                wpad,
                hstride,
                wstride,
            )
        )

    # We do not run the tuning on our web server since it takes too long.
    # Comment out the following line to run it yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.GraphModule(lib["default"](ctx))

        # upload parameters to device
        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
        m.set_input("data", image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to milliseconds
        print(
            "Mean inference time (std dev): %.2f ms (%.2f ms)"
            % (np.mean(prof_res), np.std(prof_res))
        )
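
# ---------------------------------------------------------------------
# NOTE: `compile_network` is called above but not defined in this
# snippet. A condensed sketch following the VTA autotuning tutorial; the
# Gluon model zoo source and quantization settings are assumptions.
# ---------------------------------------------------------------------
from mxnet.gluon.model_zoo import vision

from tvm import relay
from vta.top import graph_pack


def compile_network(env, target, model, start_pack, stop_pack):
    # Populate the shape and data type dictionary for the input.
    dtype_dict = {"data": "float32"}
    shape_dict = {"data": (env.BATCH, 3, 224, 224)}

    # Get an off-the-shelf Gluon model and convert it to Relay.
    gluon_model = vision.get_model(model, pretrained=True)
    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

    # Update the dictionaries with the parameter shapes and types.
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Quantize to int8, keeping the first conv layer in fp32.
    with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
        mod = relay.quantize.quantize(mod, params=params)

    # Repack the graph for VTA tensorization between the given layers.
    relay_prog = mod["main"]
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        relay_prog = graph_pack(
            relay_prog,
            env.BATCH,
            env.BLOCK_OUT,
            env.WGT_WIDTH,
            start_name=start_pack,
            stop_name=stop_pack,
        )
    return relay_prog, params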

def run_module(
    module_file,
    device,
    hostname=None,
    port=9090,
    rpc_key=None,
    inputs=None,
    fill_mode="random",
    repeat=1,
    profile=False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    module_file : str
        The path to the module file (a .tar file).
    device : str
        The device (e.g. "cpu" or "gpu") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will
        be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to
        "random".
    repeat : int, optional
        How many times to repeat the run.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of float
        execution times generated by the time evaluator
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("extracting module file %s", module_file)
        t = tarfile.open(module_file)
        t.extractall(tmp_dir)
        graph = open(os.path.join(tmp_dir, "mod.json")).read()
        params = bytearray(open(os.path.join(tmp_dir, "mod.params"), "rb").read())

        if hostname:
            # Remote RPC
            if rpc_key:
                logger.debug("running on remote RPC tracker with key %s", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("running on remote RPC with no key")
                session = rpc.connect(hostname, port)
        else:
            # Local
            logger.debug("running a local session")
            session = rpc.LocalSession()

        session.upload(os.path.join(tmp_dir, "mod.so"))
        lib = session.load_module("mod.so")

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("device is %s", device)
        if device == "gpu":
            dev = session.gpu()
        elif device == "cl":
            dev = session.cl()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if profile:
            logger.debug("creating runtime with profiling enabled")
            module = debug_executor.create(graph, lib, dev, dump_root="./prof")
        else:
            logger.debug("creating runtime with profiling disabled")
            module = runtime.create(graph, lib, dev)

        logger.debug("load params into the runtime module")
        module.load_params(params)

        shape_dict, dtype_dict = get_input_info(graph, params)
        inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

        logger.debug("setting inputs to the module")
        module.set_input(**inputs_dict)

        # Run must be called explicitly if profiling
        if profile:
            logger.debug("running the module with profiling enabled")
            module.run()

        # create the module time evaluator (returns a function)
        timer = module.module.time_evaluator("run", dev, 1, repeat=repeat)
        # call the evaluator function to invoke the module and save execution times
        prof_result = timer()
        # collect a list of execution times from the profiling results
        times = prof_result.results

        logger.debug("collecting the output tensors")
        num_outputs = module.get_num_outputs()
        outputs = {}
        for i in range(num_outputs):
            output_name = "output_{}".format(i)
            outputs[output_name] = module.get_output(i).asnumpy()

        return outputs, times
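
# ---------------------------------------------------------------------
# NOTE: hypothetical usage sketch for the file-based variant above. The
# tarball name is a placeholder; it must contain mod.so, mod.json and
# mod.params, as produced by `tvmc compile`.
# ---------------------------------------------------------------------
outputs, times = run_module("compiled_module.tar", device="cpu", fill_mode="zeros")
print("mean runtime: %g s" % (sum(times) / len(times)))
print("outputs:", {name: arr.shape for name, arr in outputs.items()})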

def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will
        be assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not
        provided, inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to
        "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug runtime.

    Returns
    -------
    TVMCResult
        The output tensors (a dict keyed by output name) and the
        execution times reported by the time evaluator.
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    # Currently only two package formats are supported: "classic" and
    # "mlf". The latter can only be used for micro targets, i.e. with µTVM.
    if tvmc_package.type == "mlf":
        raise TVMCException(
            "You're trying to run a model saved using the Model Library Format (MLF). "
            "MLF can only be used to run micro targets (µTVM)."
        )

    if hostname:
        if isinstance(port, str):
            port = int(port)
        # Remote RPC
        if rpc_key:
            logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
            session = request_remote(rpc_key, hostname, port, timeout=1000)
        else:
            logger.debug("Running on remote RPC with no key.")
            session = rpc.connect(hostname, port)
    else:
        # Local
        logger.debug("Running a local session.")
        session = rpc.LocalSession()

    session.upload(tvmc_package.lib_path)
    lib = session.load_module(tvmc_package.lib_name)

    # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
    logger.debug("Device is %s.", device)
    if device == "cuda":
        dev = session.cuda()
    elif device == "cl":
        dev = session.cl()
    else:
        assert device == "cpu"
        dev = session.cpu()

    if profile:
        logger.debug("Creating runtime with profiling enabled.")
        module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
    else:
        logger.debug("Creating runtime with profiling disabled.")
        module = runtime.create(tvmc_package.graph, lib, dev)

    logger.debug("Loading params into the runtime module.")
    module.load_params(tvmc_package.params)

    shape_dict, dtype_dict = get_input_info(tvmc_package.graph, tvmc_package.params)
    inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

    logger.debug("Setting inputs to the module.")
    module.set_input(**inputs_dict)

    # Run must be called explicitly if profiling
    if profile:
        logger.info("Running the module with profiling enabled.")
        module.run()

    # create the module time evaluator (returns a function)
    timer = module.module.time_evaluator("run", dev, number=number, repeat=repeat)
    # call the evaluator function to invoke the module and save execution times
    prof_result = timer()
    # collect a list of execution times from the profiling results
    times = prof_result.results

    logger.debug("Collecting the output tensors.")
    num_outputs = module.get_num_outputs()
    outputs = {}
    for i in range(num_outputs):
        output_name = "output_{}".format(i)
        outputs[output_name] = module.get_output(i).numpy()

    return TVMCResult(outputs, times)

def __init__(self, *args):
    # Parse arguments
    print("\nML element I/O spec: \n", args, '\n')
    model_path = args[0]
    input_shapes = util.shapes_str_to_npshapes(args[1])
    input_types = util.datatypes_str_to_nptypes(args[2])
    output_shapes = util.shapes_str_to_npshapes(args[3])
    output_types = util.datatypes_str_to_nptypes(args[4])
    input_names = util.names_str_to_strarray(args[5])
    output_names = util.names_str_to_strarray(args[6])
    self.input_shapes = input_shapes

    for input_type in input_types:
        if input_type is None:
            print("Invalid input_type")
            return None

    for output_type in output_types:
        if output_type is None:
            print("Invalid output_type")
            return None

    if (len(input_shapes) > 4 or len(input_types) > 4 or len(input_names) > 4
            or len(input_shapes) != len(input_types)
            or len(input_shapes) != len(input_names)):
        print("Invalid input count: (%d,%d,%d)" %
              (len(input_shapes), len(input_types), len(input_names)))
        return None

    if (len(output_shapes) > 4 or len(output_types) > 4 or len(output_names) > 4
            or len(output_shapes) != len(output_types)
            or len(output_shapes) != len(output_names)):
        print("Invalid output count: (%d,%d,%d)" %
              (len(output_shapes), len(output_types), len(output_names)))
        return None

    self.input_dims = []
    self.output_dims = []
    self.input_types = input_types
    self.output_types = output_types

    for i, input_shape in enumerate(input_shapes):
        input_dim = nns.TensorShape(input_shape, input_types[i])
        self.input_dims.append(input_dim)

    for i, output_shape in enumerate(output_shapes):
        output_dim = nns.TensorShape(output_shape, output_types[i])
        self.output_dims.append(output_dim)

    self.input_names = input_names
    self.output_names = output_names

    # Initialize a TVM runtime session with the given binary
    session = rpc.LocalSession()
    session.upload(os.path.join(model_path, "mod.so"))
    lib = session.load_module("mod.so")
    ctx = session.cpu()  # TODO: Hardcoded CPU backend

    # Load graph and create a module
    self.graph = open(os.path.join(model_path, "mod.json")).read()
    self.module = runtime.create(self.graph, lib, ctx)
    self.ctx = ctx

    # Load params
    self.params = bytearray(
        open(os.path.join(model_path, "mod.params"), "rb").read())
    self.module.load_params(self.params)
    return None

def main(model, start_pack, stop_pack, data_shape=(1, 3, 224, 224), dtype='float32'):
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words,
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # --------------------------
    # When the target is 'pynq', reconfigure the FPGA and runtime.
    # Otherwise, if the target is 'sim', execute locally.
    print(f"Target is {env.TARGET}")
    if env.TARGET in ["sim", "tsim"]:
        remote = rpc.LocalSession()
    else:
        print(f"Error, incorrect target for benchmarking: {env.TARGET}")
        return

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # Grab a model from the Gluon model zoo and compile it with Relay.
    # The compilation steps are:
    # 1) Front end translation from MxNet into a Relay module.
    # 2) Apply 8-bit quantization: here we skip the first conv layer
    #    and the dense layer, which will both be executed in fp32 on the CPU.
    # 3) Perform graph packing to alter the data layout for tensorization.
    # 4) Perform constant folding to reduce the number of operators
    #    (e.g. eliminate batch norm multiply).
    # 5) Perform relay build to object file.
    # 6) Load the object file onto the remote (FPGA device).
    # 7) Generate the graph runtime, `m`.

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            print(f"Error, incorrect model name: {model}")
            return

        ### Need to bind params

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)
        print("Finished quantizing graph")

        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)
            print("Finished packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)

        # Set the network parameters and inputs
        data = np.random.uniform(size=data_shape).astype(dtype)
        m.set_input(**params)
        m.set_input('data', tvm.nd.array(data.astype(dtype)))

        # Perform inference and gather execution statistics
        # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
        num = 1  # number of times we run the module for a single measurement
        rep = 1  # number of measurements (we derive std dev from this)
        timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()
            timer()
            sim_stats = simulator.stats()
            print("\nExecution statistics:")
            for k, v in sim_stats.items():
                # Since we execute the workload many times, we need to normalize
                # the stats. Note that there is always one warm-up run, so we
                # divide the overall stats by (num * rep + 1); e.g. with
                # num = rep = 1 the counters cover two runs and are halved.
                print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
        else:
            tcost = timer()
            std = np.std(tcost.results) * 1000
            mean = tcost.mean * 1000
            print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" %
                  (mean, std, env.BATCH))
            print("Average per sample inference time: %.2fms" % (mean / env.BATCH))
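
# ---------------------------------------------------------------------
# NOTE: hypothetical invocation. The packing boundary names below follow
# the VTA ResNet classification tutorials and may differ for other
# models.
# ---------------------------------------------------------------------
if __name__ == "__main__":
    main('resnet', start_pack="nn.max_pool2d", stop_pack="nn.global_avg_pool2d")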