def test_adt_list():
    mod = relay.Module()
    p = Prelude(mod)

    l1 = p.cons(relay.const(1), p.nil())
    l21 = p.cons(relay.const(2), l1)
    l321 = p.cons(relay.const(3), l21)

    f = relay.Function([], l321)
    mod["main"] = f

    exe = create_exec(mod)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    result = veval(des_vm)
    assert len(result) == 2
    assert len(result[1]) == 2
    assert len(result[1][1]) == 2

    res = []
    res.append(result[0].asnumpy().tolist())
    res.append(result[1][0].asnumpy().tolist())
    res.append(result[1][1][0].asnumpy().tolist())
    tvm.testing.assert_allclose(res, np.array([3, 2, 1]))
def test_loop():
    mod = relay.module.Module({})
    sum_up = relay.GlobalVar('sum_up')
    i = relay.var('i', shape=[], dtype='int32')
    accum = relay.var('accum', shape=[], dtype='int32')
    sb = ScopeBuilder()
    with sb.if_scope(relay.equal(i, relay.const(0, 'int32'))):
        sb.ret(accum)
    with sb.else_scope():
        one_less = relay.subtract(i, relay.const(1, 'int32'))
        new_accum = relay.add(accum, i)
        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
    func = relay.Function([i, accum], sb.get())
    mod[sum_up] = func

    loop_bound = 0
    i_data = np.array(loop_bound, dtype='int32')
    accum_data = np.array(0, dtype='int32')
    iarg = relay.var('i', shape=[], dtype='int32')
    aarg = relay.var('accum', shape=[], dtype='int32')
    mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg))

    exe = create_exec(mod)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    result = veval(des_vm, i_data, accum_data)
    tvm.testing.assert_allclose(result.asnumpy(), sum(range(1, loop_bound + 1)))
def get_serialized_output(mod, *data, params=None, target="llvm", ctx=tvm.cpu()):
    exe = create_exec(mod, target, params=params)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec, ctx)
    result = des_vm.run(*data)
    return result
def get_vm_output(mod, data, params, target, ctx, dtype='float32',
                  number=2, repeat=20, measure=False, model="model"):
    with tvm.transform.PassContext(opt_level=3):
        exe = vm.compile(mod, target, params=params)
        rly_vm = vm_rt.VirtualMachine(exe)
        rly_vm.init(ctx)
        result = rly_vm.run(data)

    # When measure=True, time the "main" entry of `model` with the VM's
    # time evaluator.
    if measure:
        print("Evaluate vm inference cost of {} on {}".format(model, repr(ctx)))
        ftimer = rly_vm.mod.time_evaluator("invoke", ctx, number=number, repeat=repeat)
        # Measure in milliseconds.
        prof_res = np.array(ftimer("main", data).results) * 1000
        print("Mean vm inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

    return result.asnumpy().astype(dtype)
def test_save_load():
    x = relay.var("x", shape=(10, 10))
    f = relay.Function([x], x + x)
    x_data = np.random.rand(10, 10).astype("float32")

    # serialize.
    exe = create_exec(f)
    code, lib = exe.save()
    assert isinstance(code, bytearray)

    # save and load the code and lib file.
    tmp = utils.tempdir()
    path_lib = tmp.relpath("lib.so")
    lib.export_library(path_lib)
    with open(tmp.relpath("code.ro"), "wb") as fo:
        fo.write(code)

    loaded_lib = tvm.runtime.load_module(path_lib)
    with open(tmp.relpath("code.ro"), "rb") as fi:
        loaded_code = bytearray(fi.read())

    # deserialize.
    des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
    des_vm = _vm.VirtualMachine(des_exec, tvm.cpu())

    res = des_vm.run(x_data)
    tvm.testing.assert_allclose(res.asnumpy(), x_data + x_data)
def __init__(self, mod, ctx, target):
    if mod is None:
        raise RuntimeError("Must provide module to get VM executor.")
    self.mod = mod
    self.ctx = ctx
    self.target = target
    self.executable = compile(mod, target)
    self.vm = vm_rt.VirtualMachine(self.executable, ctx)
def get_serialized_output(mod, data, params, target, ctx, dtype='float32'):
    exe = create_exec(mod, target, params=params)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(ctx)
    result = des_vm.run(data)
    return result.asnumpy().astype(dtype)
def test_vm_onnx_process():
    import onnx

    onnx_model_path = "/data00/cuiqing.li/onnx_models/sr_dy.onnx"
    onnx_model = onnx.load(onnx_model_path)
    shape_dict = {"input.1": (1, 1, 640, 360)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

    target = tvm.target.cuda()
    ctx = tvm.context(str(target), 0)
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
        exe = vm.compile(mod, target, params=params)

    code, lib = exe.save()
    saved_dir = "tmp"
    os.makedirs(saved_dir, exist_ok=True)
    path_lib = os.path.join(saved_dir, "lib.so")
    lib.export_library(path_lib)
    code_path = os.path.join(saved_dir, "code.ro")
    with open(code_path, "wb") as fo:
        fo.write(code)

    loaded_lib = tvm.runtime.load_module(path_lib)
    with open(code_path, "rb") as fi:
        loaded_code = bytearray(fi.read())

    # deserialize.
    des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
    des_vm = _vm.VirtualMachine(des_exec, ctx)

    input_shape = [1, 1, 640, 360]
    dtype = "float32"
    data = (tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype)),)
    res = des_vm.run(*data)

    print("Evaluate vm inference cost of {} on {}".format(
        "your testing model", repr(ctx)))
    # Warm up: time_evaluator only builds a callable, so it must be invoked
    # for the warmup runs to actually happen.
    ftimer_warmup = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=50)
    ftimer_warmup("main", *data)
    print("finished warming up and start testing vm compile performance")
    ftimer = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=600)
    # Measure in milliseconds.
    prof_res = np.array(ftimer("main", *data).results) * 1000
    print("Mean vm inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
def _make_executor(self, expr=None):
    if expr:
        self.mod["main"] = expr
    self.executable = compile(self.mod, self.target)
    self.vm = vm_rt.VirtualMachine(self.executable, self.device)

    def _vm_wrapper(*args, **kwargs):
        args = self._convert_args(self.mod["main"], args, kwargs)
        return self.vm.run(*args)

    return _vm_wrapper
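# Usage sketch (assumption: a recent TVM where relay.create_executor takes a
# `device=` argument): create_executor("vm") constructs the VM executor above,
# and evaluate() returns the _vm_wrapper closure built by _make_executor.
def _example_vm_executor_usage():
    import numpy as np
    import tvm
    from tvm import relay

    x = relay.var("x", shape=(2, 2), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], x + x))

    run = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm").evaluate()
    out = run(np.ones((2, 2), dtype="float32"))
    print(out.numpy())  # [[2. 2.] [2. 2.]]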
def test_const():
    c = relay.const(1.0, "float32")
    x = relay.var('x', shape=(10, 10), dtype='float32')
    f = relay.Function([x], x + c)

    exe = create_exec(f)
    code, lib = exe.save()
    assert isinstance(code, bytearray)
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    x_data = np.random.rand(10, 10).astype('float32')
    res = veval(des_vm, x_data)
    tvm.testing.assert_allclose(res.asnumpy(), x_data + 1)
def test_tuple():
    ttype = relay.TupleType([relay.TensorType((1,)), relay.TensorType((10,))])
    tup = relay.var('tup', type_annotation=ttype)
    f = relay.Function([tup], relay.TupleGetItem(tup, 1))

    # Input shapes match the declared tuple type: (1,) and (10,).
    i_data = np.random.rand(1).astype('float32')
    j_data = np.random.rand(10).astype('float32')

    exe = create_exec(f)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    result = veval(des_vm, (i_data, j_data))
    tvm.testing.assert_allclose(result.asnumpy(), j_data)
def test_closure():
    x = relay.var('x', shape=())
    y = relay.var('y', shape=())
    f = relay.Function([x], x + y)
    ff = relay.Function([y], f)
    clo = ff(relay.const(1.0))
    main = clo(relay.const(2.0))

    exe = create_exec(main)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    res = veval(des_vm)
    tvm.testing.assert_allclose(res.asnumpy(), 3.0)
def test_adt_compose():
    mod = relay.Module()
    p = Prelude(mod)
    compose = p.compose

    # add_one = fun x -> x + 1
    sb = relay.ScopeBuilder()
    x = relay.var('x', 'float32')
    x1 = sb.let('x1', x)
    xplusone = x1 + relay.const(1.0, 'float32')
    sb.ret(xplusone)
    body = sb.get()
    add_one = relay.GlobalVar("add_one")
    add_one_func = relay.Function([x], body)

    # add_two = compose(add_one, add_one)
    sb = relay.ScopeBuilder()
    y = relay.var('y', 'float32')
    add_two_func = sb.let('add_two', compose(add_one_func, add_one_func))
    add_two_res = add_two_func(y)
    sb.ret(add_two_res)
    add_two_body = sb.get()

    mod[add_one] = add_one_func
    f = relay.Function([y], add_two_body)
    mod["main"] = f

    exe = create_exec(mod)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    x_data = np.array(np.random.rand()).astype('float32')
    result = veval(des_vm, x_data)
    tvm.testing.assert_allclose(result.asnumpy(), x_data + 2.0)
def main():
    # `config` is expected to be a module-level dict; see the example below.
    params_dict = config
    saved_dir = params_dict['saved_dir']
    # target = "cuda -libs=cudnn,cublas"
    target = params_dict['target']
    ctx = tvm.context(str(target), 0)

    path_lib = os.path.join(saved_dir, "lib.so")
    code_path = os.path.join(saved_dir, "code.ro")
    loaded_lib = tvm.runtime.load_module(path_lib)
    with open(code_path, "rb") as fi:
        loaded_code = bytearray(fi.read())

    # deserialize.
    des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
    des_vm = _vm.VirtualMachine(des_exec, ctx)

    dtype = params_dict['dtype']
    data = tuple(
        tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype))
        for input_shape in params_dict['inference_input_shapes'])
    res = des_vm.run(*data)

    print("Evaluate vm inference cost of {} on {}".format(
        "your testing model", repr(ctx)))
    # Warm up: time_evaluator only builds a callable, so it must be invoked
    # for the warmup runs to actually happen.
    ftimer_warmup = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=50)
    ftimer_warmup("main", *data)
    print("finished warming up and start testing vm compile performance")
    ftimer = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=600)
    # Measure in milliseconds.
    prof_res = np.array(ftimer("main", *data).results) * 1000
    print("Mean vm inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
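# A hypothetical `config` satisfying the keys main() reads; the key names come
# from main() above, the values are illustrative only.
config = {
    "saved_dir": "tmp",                            # directory holding lib.so / code.ro
    "target": "cuda",                              # any TVM target string
    "dtype": "float32",
    "inference_input_shapes": [[1, 3, 224, 224]],  # one shape per model input
}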
def test_if():
    x = relay.var('x', shape=(10, 10))
    y = relay.var('y', shape=(10, 10))
    equal = relay.op.equal(x, y)
    equal = relay.op.nn.batch_flatten(equal)
    f = relay.Function([x, y], relay.If(relay.op.min(equal, axis=[0, 1]), x, y))

    x_data = np.random.rand(10, 10).astype('float32')
    y_data = np.random.rand(10, 10).astype('float32')

    exe = create_exec(f)
    code, lib = exe.save()
    des_exec = _vm.Executable.load_exec(code, lib)
    des_vm = _vm.VirtualMachine(des_exec)
    des_vm.init(tvm.cpu())

    # same
    res = veval(des_vm, x_data, x_data)
    tvm.testing.assert_allclose(res.asnumpy(), x_data)

    # diff
    res = veval(des_vm, x_data, y_data)
    tvm.testing.assert_allclose(res.asnumpy(), y_data)
def run_module(
    tvmc_package: TVMCPackage,
    device: str,
    hostname: Optional[str] = None,
    port: Union[int, str] = 9090,
    rpc_key: Optional[str] = None,
    inputs: Optional[Dict[str, np.ndarray]] = None,
    fill_mode: str = "random",
    repeat: int = 10,
    number: int = 10,
    profile: bool = False,
    end_to_end: bool = False,
    options: dict = None,
):
    """Run a compiled graph executor module locally or remotely with
    optional input values.

    If input tensors are not specified explicitly, they can be filled
    with zeroes, ones or random data.

    Parameters
    ----------
    tvmc_package : TVMCPackage
        The compiled model package object that will be run.
    device : str
        The device (e.g. "cpu" or "cuda") to be targeted by the RPC
        session, local or remote.
    hostname : str, optional
        The hostname of the target device on which to run.
    port : int, optional
        The port of the target device on which to run.
    rpc_key : str, optional
        The tracker key of the target device. If this is set, it will be
        assumed that remote points to a tracker.
    inputs : dict, optional
        A dictionary that maps input names to numpy values. If not provided,
        inputs will be generated using the fill_mode argument.
    fill_mode : str, optional
        The fill-mode to use when generating data for input tensors.
        Valid options are "zeros", "ones" and "random". Defaults to "random".
    repeat : int, optional
        How many times to repeat the run.
    number : int, optional
        The number of runs to measure within each repeat.
    profile : bool
        Whether to profile the run with the debug executor.
    end_to_end : bool
        Whether to measure the time of memory copies as well as model
        execution. Turning this on can provide a more realistic estimate
        of how long running the model in production would take.

    Returns
    -------
    outputs : dict
        a dictionary with output tensors, generated by the module
    times : list of str
        execution times generated by the time evaluator
    """
    if not isinstance(tvmc_package, TVMCPackage):
        raise TVMCException(
            "This model doesn't seem to have been compiled yet. "
            "Try calling tvmc.compile on the model before running it."
        )

    with ExitStack() as stack:
        # Currently only two package formats are supported: "classic" and
        # "mlf". The latter can only be used for micro targets, i.e. with microTVM.
        if device == "micro":
            if tvmc_package.type != "mlf":
                raise TVMCException(
                    f"Model {tvmc_package.package_path} is not a MLF archive.")

            project_dir = get_project_dir(tvmc_package.project_dir)

            # This is guaranteed to work since project_dir was already checked when
            # building the dynamic parser to accommodate the project options, so no
            # checks are in place when calling GeneratedProject.
            project_ = project.GeneratedProject.from_directory(project_dir, options)
        else:
            if tvmc_package.type == "mlf":
                raise TVMCException(
                    "You're trying to run a model saved using the Model Library Format (MLF). "
                    "MLF can only be used to run micro devices ('--device micro')."
                )

        if hostname:
            if isinstance(port, str):
                port = int(port)
            # Remote RPC
            if rpc_key:
                logger.debug("Running on remote RPC tracker with key %s.", rpc_key)
                session = request_remote(rpc_key, hostname, port, timeout=1000)
            else:
                logger.debug("Running on remote RPC with no key.")
                session = rpc.connect(hostname, port)
        elif device == "micro":
            # Remote RPC (running on a micro target)
            logger.debug("Running on remote RPC (micro target).")
            try:
                session = tvm.micro.Session(project_.transport())
                stack.enter_context(session)
            except Exception:
                raise TVMCException("Could not open a session with the micro target.")
        else:
            # Local
            logger.debug("Running a local session.")
            session = rpc.LocalSession()

        # Micro targets don't support uploading a model. The model to be run
        # must already be flashed into the micro target before one tries
        # to run it. Hence skip model upload for micro targets.
        if device != "micro":
            session.upload(tvmc_package.lib_path)
            lib = session.load_module(tvmc_package.lib_name)

        # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron)
        logger.debug("Device is %s.", device)
        if device == "cuda":
            dev = session.cuda()
        elif device == "cl":
            dev = session.cl()
        elif device == "metal":
            dev = session.metal()
        elif device == "vulkan":
            dev = session.vulkan()
        elif device == "rocm":
            dev = session.rocm()
        elif device == "micro":
            dev = session.device
            lib = session.get_system_lib()
        else:
            assert device == "cpu"
            dev = session.cpu()

        if tvmc_package.type == "vm":
            assert inputs is not None, "vm runner requires inputs to be provided as a dict"

            input_tensor = {}
            for e, i in inputs.items():
                input_tensor[e] = tvm.nd.array(i, dev)

            if profile:
                logger.debug("Creating vm with profile enabled.")
                exe = profiler_vm.VirtualMachineProfiler(lib, dev)
                res = exe.profile(**input_tensor, func_name="main")
                # This print is intentional
                print(res)
            else:
                exe = vm.VirtualMachine(lib, dev)

            exe_outputs = exe.invoke("main", **input_tensor)
            times = exe.benchmark(
                dev,
                **input_tensor,
                func_name="main",
                repeat=repeat,
                number=number,
                end_to_end=end_to_end,
            )

            # Special handling if the output only has a single value
            if not isinstance(exe_outputs, list):
                exe_outputs = [exe_outputs]

            outputs = {}
            for i, val in enumerate(exe_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = val.numpy()
        else:
            # TODO(gromero): Adjust for micro targets.
            if profile:
                logger.debug("Creating runtime with profiling enabled.")
                module = debug_executor.create(tvmc_package.graph, lib, dev, dump_root="./prof")
            else:
                if device == "micro":
                    logger.debug("Creating runtime (micro) with profiling disabled.")
                    module = tvm.micro.create_local_graph_executor(tvmc_package.graph, lib, dev)
                else:
                    logger.debug("Creating runtime with profiling disabled.")
                    module = executor.create(tvmc_package.graph, lib, dev)

            logger.debug("Loading params into the runtime module.")
            module.load_params(tvmc_package.params)

            logger.debug("Collecting graph input shape and type:")
            shape_dict, dtype_dict = module.get_input_info()
            logger.debug("Graph input shape: %s", shape_dict)
            logger.debug("Graph input type: %s", dtype_dict)

            inputs_dict = make_inputs_dict(shape_dict, dtype_dict, inputs, fill_mode)

            logger.debug("Setting inputs to the module.")
            module.set_input(**inputs_dict)

            # Run must be called explicitly if profiling
            if profile:
                logger.info("Running the module with profiling enabled.")
                report = module.profile()
                # This print is intentional
                print(report)

            if device == "micro":
                # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
                # fixed, module.benchmark() can be used instead and this if/else
                # can be removed.
                module.run()
                times = []
            else:
                # Call the benchmarking function of the executor.
                # Optionally measure e2e data transfers from the
                # CPU to device memory overheads (e.g. PCIE
                # overheads if the device is a discrete GPU).
                if end_to_end:
                    dev = session.cpu()
                times = module.benchmark(dev, number=number, repeat=repeat, end_to_end=end_to_end)

            logger.debug("Collecting the output tensors.")
            num_outputs = module.get_num_outputs()
            outputs = {}
            for i in range(num_outputs):
                output_name = "output_{}".format(i)
                outputs[output_name] = module.get_output(i).numpy()

        return TVMCResult(outputs, times)
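# A minimal local-run sketch for run_module via the public tvmc driver
# (tvm.driver.tvmc); the model file name is hypothetical, and tvmc.run is
# assumed to be the thin wrapper the driver exposes over run_module.
def _example_run_module_usage():
    from tvm.driver import tvmc

    model = tvmc.load("my_model.onnx")            # hypothetical model file
    package = tvmc.compile(model, target="llvm")
    result = tvmc.run(package, device="cpu", repeat=3, number=5)

    print(result.times)                 # benchmark timings from the time evaluator
    print(result.outputs["output_0"])   # first output tensor as a numpy array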
def vm_tensorflow_model_process():
    def normalize_node_name(nodes):
        from tensorflow.compat import as_text
        if isinstance(nodes, list):
            ret = [as_text(node.split(':', 1)[0], 'ascii') for node in nodes]
        else:
            ret = as_text(nodes.split(':', 1)[0], 'ascii')
        return ret

    import tensorflow as tf
    from tvm.relay.frontend.tensorflow_parser import TFParser

    TF_pb_path = "/home/tiger/cuiqing.li/models/TF_checkpoint/latest"
    graph_def = TFParser(TF_pb_path).parse()
    input_names = ["input_ids_1:0", "input_mask_1:0", "segment_ids_1:0"]
    output_names = ["loss/Softmax:0"]
    input_shapes = [[1, 256], [1, 256], [1, 256]]
    input_names = [normalize_node_name(i) for i in input_names]
    output_names = [normalize_node_name(i) for i in output_names]

    mod, params = relay.frontend.from_tensorflow(
        graph_def,
        shape={k: v for k, v in zip(input_names, input_shapes)},
        layout=None,
        outputs=output_names)

    desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
    seq = tvm.transform.Sequential([
        relay.transform.RemoveUnusedFunctions(),
        relay.transform.ConvertLayout(desired_layouts)
    ])
    with tvm.ir.transform.PassContext(opt_level=3):
        mod = seq(mod)

    target = tvm.target.cuda()
    ctx = tvm.context(str(target), 0)
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
        exe = vm.compile(mod, target, params=params)

    code, lib = exe.save()
    saved_dir = "tmp"
    os.makedirs(saved_dir, exist_ok=True)
    path_lib = os.path.join(saved_dir, "lib.so")
    lib.export_library(path_lib)
    code_path = os.path.join(saved_dir, "code.ro")
    with open(code_path, "wb") as fo:
        fo.write(code)

    loaded_lib = tvm.runtime.load_module(path_lib)
    with open(code_path, "rb") as fi:
        loaded_code = bytearray(fi.read())

    # deserialize.
    des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
    des_vm = _vm.VirtualMachine(des_exec, ctx)

    dtype = "int32"
    data = tuple(
        tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype), ctx)
        for input_shape in input_shapes)
    res = des_vm.run(*data)

    print("Evaluate vm inference cost of {} on {}".format(
        "your testing model", repr(ctx)))
    # Warm up: time_evaluator only builds a callable, so it must be invoked
    # for the warmup runs to actually happen.
    ftimer_warmup = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=50)
    ftimer_warmup("main", *data)
    print("finished warming up and start testing vm compile performance")
    ftimer = des_vm.module.time_evaluator("invoke", ctx, number=1, repeat=100)
    # Measure in milliseconds.
    prof_res = np.array(ftimer("main", *data).results) * 1000
    print("Mean vm inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))