def compile_network(opt, env, target):
    """Turn a pretrained Gluon vision model into a quantized Relay program.

    The model named by ``opt.model`` is fetched with pretrained weights,
    imported into Relay with a ``(env.BATCH, 3, 224, 224)`` "data" input and
    quantized.  When ``target.device_name`` is "vta" the quantized program is
    additionally passed through ``graph_pack`` between the operators named by
    ``opt.start_name`` and ``opt.stop_name``.

    Returns the ``(relay_prog, params)`` pair.
    """
    # Input signature of the network
    input_shapes = {"data": (env.BATCH, 3, 224, 224)}
    input_dtypes = {"data": 'float32'}

    # Fetch the model and import it into Relay
    net = vision.get_model(opt.model, pretrained=True)
    mod, params = relay.frontend.from_mxnet(net, input_shapes)

    # Record shape and dtype of every imported parameter
    input_shapes.update({name: value.shape for name, value in params.items()})
    input_dtypes.update({name: str(value.dtype) for name, value in params.items()})

    # Quantize the entry function
    with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
        relay_prog = relay.quantize.quantize(mod[mod.entry_func], params=params)

    # Nothing more to do unless we are compiling for VTA
    if target.device_name != "vta":
        return relay_prog, params

    # Graph packing and constant folding for the VTA target
    assert env.BLOCK_IN == env.BLOCK_OUT
    relay_prog = graph_pack(
        relay_prog,
        env.BATCH,
        env.BLOCK_OUT,
        env.WGT_WIDTH,
        start_name=opt.start_name,
        stop_name=opt.stop_name,
    )
    return relay_prog, params
def compile_network(env, target, model, start_pack, stop_pack):
    """Import, quantize and (for VTA) pack a pretrained Gluon vision model.

    Parameters
    ----------
    env : VTA environment providing BATCH, BLOCK_IN, BLOCK_OUT and WGT_WIDTH.
    target : compilation target; its ``device_name`` selects the VTA path.
    model : name passed to ``vision.get_model``.
    start_pack, stop_pack : operator names forwarded to ``graph_pack`` as
        ``start_name`` / ``stop_name``.

    Returns
    -------
    (relay_prog, params) : the Relay function to build and its parameters.
        For non-VTA targets ``relay_prog`` is the quantized ``"main"``
        function without graph packing.
    """
    # Populate the shape and data type dictionary
    dtype_dict = {"data": 'float32'}
    shape_dict = {"data": (env.BATCH, 3, 224, 224)}

    # Get off the shelf gluon model, and convert to relay
    gluon_model = vision.get_model(model, pretrained=True)
    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

    # Update shape and type dictionary
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Perform quantization in Relay
    # Note: We set opt_level to 3 in order to fold batch norm
    with relay.build_config(opt_level=3):
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            mod = relay.quantize.quantize(mod, params=params)

    if target.device_name == "vta":
        # Perform graph packing and constant folding for VTA target
        assert env.BLOCK_IN == env.BLOCK_OUT
        relay_prog = graph_pack(
            mod["main"],
            env.BATCH,
            env.BLOCK_OUT,
            env.WGT_WIDTH,
            start_name=start_pack,
            stop_name=stop_pack,
        )
    else:
        # Without this branch relay_prog would be unbound at the return
        # below whenever the target is not VTA.
        relay_prog = mod["main"]

    return relay_prog, params
def build_model(model_name, remote, target, ctx, vta_env):
    """Compile a pretrained Gluon model and load it on ``remote``.

    The model is imported into Relay, quantized, packed when the target
    device is VTA, built, uploaded to ``remote`` as ``graphlib.o`` and wrapped
    in a graph runtime module created on ``ctx`` with its parameters set.

    Returns the graph runtime module.
    """
    # Use the pre-configured AutoTVM schedules for this target.
    with autotvm.tophub.context(target):
        # Input signature of the network.
        input_dtypes = {'data': 'float32'}
        input_shapes = {'data': (vta_env.BATCH, 3, 224, 224)}

        # Fetch the model and import it into Relay.
        net = vision.get_model(model_name, pretrained=True)
        mod, params = relay.frontend.from_mxnet(net, input_shapes)

        # Record shape and dtype of every imported parameter.
        input_shapes.update({name: value.shape for name, value in params.items()})
        input_dtypes.update({name: str(value.dtype) for name, value in params.items()})

        # Quantize the main function.
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Graph packing and constant folding, VTA only.
        if target.device_name == 'vta':
            assert vta_env.BLOCK_IN == vta_env.BLOCK_OUT
            relay_prog = graph_pack(
                relay_prog,
                vta_env.BATCH,
                vta_env.BLOCK_OUT,
                vta_env.WGT_WIDTH,
                start_name=START_PACK,
                stop_name=STOP_PACK,
            )

        # Build with AlterOpLayout disabled; the VTA path additionally needs
        # the VTA build configuration and builds for vta_env.target.
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name != 'vta':
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=vta_env.target_host,
                )
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=vta_env.target,
                        params=params,
                        target_host=vta_env.target_host,
                    )

        # Ship the compiled library to the remote RPC server and load it there.
        workdir = util.tempdir()
        lib.save(workdir.relpath('graphlib.o'))
        remote.upload(workdir.relpath('graphlib.o'))
        lib = remote.load_module('graphlib.o')

        runtime_module = graph_runtime.create(graph, lib, ctx)
        runtime_module.set_input(**params)
        return runtime_module
def compile_mxnet_gulon_resnet(_env, _model):
    """Compile a pretrained Gluon model for ``_env.target``.

    The model named ``_model`` is imported into Relay, post-processed by
    ``merge_transform_to_mxnet_model``, quantized, packed with the start/stop
    operator names found in ``PACK_DICT[_model]`` and finally built.

    Returns the ``(graph, lib, params)`` triple produced by ``relay.build``.
    """
    # Input signature of the ImageNet classifier
    input_dtypes = {"data": 'float32'}
    input_shapes = {"data": (_env.BATCH, 3, 224, 224)}

    # Fetch the model and run the front end
    net = vision.get_model(_model, pretrained=True)
    mod, params = relay.frontend.from_mxnet(net, input_shapes)
    mod = merge_transform_to_mxnet_model(mod)

    # Record shape and dtype of every imported parameter
    input_shapes.update({name: value.shape for name, value in params.items()})
    input_dtypes.update({name: str(value.dtype) for name, value in params.items()})

    # Use the pre-configured AutoTVM schedules
    with autotvm.tophub.context(_env.target):
        # opt_level 3 is used so that batch norm gets folded
        with relay.build_config(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)

            # Graph packing and constant folding for the VTA target
            relay_prog = graph_pack(
                mod["main"],
                _env.BATCH,
                _env.BLOCK_IN,
                _env.WGT_WIDTH,
                start_name=PACK_DICT[_model][0],
                stop_name=PACK_DICT[_model][1],
            )

        # Build with AlterOpLayout disabled under the VTA build configuration
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            with vta.build_config(debug_flag=0):
                graph, lib, params = relay.build(
                    relay_prog,
                    target=_env.target,
                    params=params,
                    target_host=_env.target_host,
                )

    return graph, lib, params
def convert_to_vta(model_path, image_channel, image_size):
    """Load a PyTorch model from disk and convert it to a packed Relay function.

    The model is loaded on the CPU, traced with a random
    ``[1, image_channel, image_size, image_size]`` input, imported into Relay,
    quantized and passed through ``graph_pack`` using the "yolov3-tiny" entry
    of the local pack table.  The Relay main function is printed after the
    import and again after quantization.

    Returns the result of ``graph_pack``.
    """
    # Load the model on the CPU in evaluation mode
    cpu = torch.device('cpu')
    model = torch.load(model_path, map_location=cpu)
    model = model.eval()

    # Trace the model with a random input of the requested shape
    input_shape = [1, image_channel, image_size, image_size]
    example_input = torch.randn(input_shape)
    traced = torch.jit.trace(model, example_input).eval()

    # NOTE(review): input_name is not defined in this function; presumably a
    # module-level name -- confirm.
    mod, params = relay.frontend.from_pytorch(traced, [(input_name, input_shape)])
    print(mod["main"])

    # Local RPC session, device context and target settings
    remote = rpc.LocalSession()
    ctx = remote.ext_dev(0)
    target = 'vta'
    target_host = 'vta'
    env = vta.get_env()

    # [start operator, stop operator, start index, stop index] per model
    pack_dict = {
        "yolov3-tiny": ["nn.max_pool2d", "cast", 8, 237],
    }
    MODEL_NAME = 'yolov3-tiny'

    with tvm.transform.PassContext(opt_level=2):
        quant_cfg = relay.quantize.qconfig(
            global_scale=33.0,
            skip_conv_layers=[0],
            store_lowbit_output=True,
            round_for_shift=True,
        )
        with quant_cfg:
            mod = relay.quantize.quantize(mod, params=params)
        print(mod["main"])

        mod = graph_pack(
            mod["main"],
            env.BATCH,
            env.BLOCK_OUT,
            env.WGT_WIDTH,
            start_name=pack_dict[MODEL_NAME][0],
            stop_name=pack_dict[MODEL_NAME][1],
            start_name_idx=pack_dict[MODEL_NAME][2],
            stop_name_idx=pack_dict[MODEL_NAME][3],
        )
    return mod
def compile_model(self):
    """Build the inference graph for this instance and load it on its remote.

    Connects to the remote (an RPC server at ``self.pynq_addr:9091`` when
    ``device`` is 'vta', a local session otherwise), imports a Gluon ResNet-18
    variant into Relay, quantizes it, packs it for VTA when applicable,
    builds it, uploads the library as ``graphlib.o`` and creates the graph
    runtime.

    Sets ``self.remote``, ``self.ctx``, ``self.params`` and ``self.m``.

    NOTE(review): ``device``, ``target``, ``env``, ``args``, ``ctx``,
    ``start_pack`` and ``stop_pack`` are not defined in this method; they are
    presumably module-level names -- confirm.
    """
    if device == 'vta':
        self.remote = rpc.connect(self.pynq_addr, 9091)
        vta.reconfig_runtime(self.remote)
        vta.program_fpga(self.remote, bitstream=None)
    else:
        self.remote = rpc.LocalSession()

    self.ctx = self.remote.ext_dev(0) if device == 'vta' else self.remote.cpu(0)

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (env.BATCH, 3, 224, 224)}

        # Either the feature extractor of the full network or the split
        # variant selected by this instance's id
        gluon_model = vision.resnet18_v1(
            pretrained=True, ctx=ctx
        ).features if args.nonsplit else splitnet.resnet18_v1_split(
            self.id + 1)

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target
        if target.device_name == 'vta':
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(
                relay_prog,
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name=start_pack,
                stop_name=stop_pack,
            )

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name != 'vta':
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host,
                )
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host,
                    )

        self.params = params

        # Measure Relay build time.  The elapsed time is interpolated
        # directly: the previous f-string contained "{0:.4f}", which formats
        # the literal 0 and leaves nothing for .format(build_time) to fill.
        build_time = time.time() - build_start
        print(f'inference graph for thread {self.id} built in {build_time:.4f}s!')

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        self.remote.upload(temp.relpath('graphlib.o'))
        lib = self.remote.load_module('graphlib.o')

        # Graph runtime
        self.m = graph_runtime.create(graph, lib, self.ctx)
# Excerpt: import a Darknet network into Relay, prepare it for the selected
# target and build it.
# NOTE(review): net, dtype, dshape, target, env, pack_dict, MODEL_NAME and
# build_start are defined outside this excerpt -- confirm against the
# enclosing code.
mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=dshape)

if target.device_name == "vta":
    # Perform quantization in Relay
    # Note: We set opt_level to 3 in order to fold batch norm
    with relay.build_config(opt_level=3):
        with relay.quantize.qconfig(global_scale=33.0,
                                    skip_conv_layers=[0],
                                    store_lowbit_output=True,
                                    round_for_shift=True):
            mod = relay.quantize.quantize(mod, params=params)
        # Perform graph packing and constant folding for VTA target.
        # The pack_dict entry supplies start/stop operator names (items 0, 1)
        # and their indices (items 2, 3).
        mod = graph_pack(mod["main"],
                         env.BATCH,
                         env.BLOCK_OUT,
                         env.WGT_WIDTH,
                         start_name=pack_dict[MODEL_NAME][0],
                         stop_name=pack_dict[MODEL_NAME][1],
                         start_name_idx=pack_dict[MODEL_NAME][2],
                         stop_name_idx=pack_dict[MODEL_NAME][3])
else:
    # Other targets build the imported main function as is
    mod = mod["main"]

# Compile Relay program with AlterOpLayout disabled
with vta.build_config(disabled_pass={"AlterOpLayout"}):
    graph, lib, params = relay.build(mod,
                                     target=target,
                                     params=params,
                                     target_host=env.target_host)

# Measure Relay build time
build_time = time.time() - build_start
# Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) if target.device_name == "vta": # Perform quantization in Relay # Note: We set opt_level to 3 in order to fold batch norm with relay.build_config(opt_level=3): with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): mod = relay.quantize.quantize(mod, params=params) # Perform graph packing and constant folding for VTA target assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack(mod["main"], env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1]) else: relay_prog = mod["main"] # Compile Relay program with AlterOpLayout disabled with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build(relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(): graph, lib, params = relay.build(relay_prog,
# Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) # Perform quantization in Relay with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): relay_prog = relay.quantize.quantize(mod["main"], params=params) # Perform graph packing and constant folding for VTA target if target.device_name == "vta": assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack( relay_prog, env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, stop_name=stop_pack) # Compile Relay program with AlterOpLayout disabled with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): if target.device_name != "vta": graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host) else: with vta.build_config(): graph, lib, params = relay.build( relay_prog, target=target, params=params, target_host=env.target_host)
# Excerpt: quantize and pack a Relay module for VTA, or take its main
# function unchanged for other targets, then build the non-VTA case.
# NOTE(review): mod, params, target, env, pack_dict and model are defined
# outside this excerpt, and the VTA build branch is not visible here --
# confirm against the enclosing code.
if target.device_name == "vta":
    # Perform quantization in Relay
    # Note: We set opt_level to 3 in order to fold batch norm
    with tvm.transform.PassContext(opt_level=3):
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            mod = relay.quantize.quantize(mod, params=params)
        # Perform graph packing and constant folding for VTA target
        assert env.BLOCK_IN == env.BLOCK_OUT
        # do device annotation if target is intelfocl or sim
        relay_prog = graph_pack(
            mod["main"],
            env.BATCH,
            env.BLOCK_OUT,
            env.WGT_WIDTH,
            start_name=pack_dict[model][0],
            stop_name=pack_dict[model][1],
            device_annot=(env.TARGET == "intelfocl"),
        )
else:
    # Other targets build the main function without quantization or packing
    relay_prog = mod["main"]

# Compile Relay program with AlterOpLayout disabled
if target.device_name != "vta":
    with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
        graph, lib, params = relay.build(relay_prog,
                                         target=target,
                                         params=params,
                                         target_host=env.target_host)
def main(model, start_pack, stop_pack, data_shape=(1, 3, 224, 224), dtype='float32'):
    """Compile one of the supported models for VTA and run it once.

    Parameters
    ----------
    model : one of 'resnet', 'yolo', 'lenet' or 'mobilenet'.
    start_pack, stop_pack : operator names forwarded to ``graph_pack`` as
        ``start_name`` / ``stop_name``.
    data_shape : shape of the "data" input and of the random test input.
    dtype : dtype of the random test input.

    Raises
    ------
    ValueError
        If ``env.TARGET`` is not "sim" or "tsim", or ``model`` is not one of
        the supported names.  Previously these cases only printed a message
        and then failed on an unbound local variable.
    """
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # --------------------------
    # Only the simulator targets are supported; they execute locally.
    print(f"Target is {env.TARGET}")
    if env.TARGET not in ["sim", "tsim"]:
        raise ValueError(
            f"Error, incorrect target for benchmarking: {env.TARGET}")
    remote = rpc.LocalSession()

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # The compilation steps are:
    #    1) Front end translation into a Relay module.
    #    2) Apply 8-bit quantization: here we skip the first conv layer,
    #       and dense layer which will both be executed in fp32 on the CPU.
    #    3) Perform graph packing to alter the data layout for tensorization.
    #    4) Perform constant folding to reduce number of operators (e.g.
    #       eliminate batch norm multiply).
    #    5) Perform relay build to object file.
    #    6) Load the object file onto remote (FPGA device).
    #    7) Generate graph runtime, `m`.

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for the network input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            raise ValueError(f"Error, incorrect model name: {model}")

        ### Need to bind params

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        print("Finishing quantizing graph")

        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(
                relay_prog,
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name=start_pack,
                stop_name=stop_pack,
            )

        print("Finishing packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host,
                )
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host,
                    )

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)

    # Set the network parameters and inputs
    data = np.random.uniform(size=data_shape).astype(dtype)
    m.set_input(**params)
    m.set_input('data', tvm.nd.array(data.astype(dtype)))

    # Perform inference and gather execution statistics
    # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
    num = 1  # number of times we run module for a single measurement
    rep = 1  # number of measurements (we derive std dev from this)
    timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()
        timer()
        sim_stats = simulator.stats()
        print("\nExecution statistics:")
        for k, v in sim_stats.items():
            # Since we execute the workload many times, we need to normalize stats
            # Note that there is always one warm up run
            # Therefore we divide the overall stats by (num * rep + 1)
            print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
    else:
        # Wall-clock timing path; only simulator targets currently get past
        # the target check above, so this runs only once other targets are
        # given a remote.
        tcost = timer()
        std = np.std(tcost.results) * 1000
        mean = tcost.mean * 1000
        print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" %
              (mean, std, env.BATCH))
        print("Average per sample inference time: %.2fms" % (mean / env.BATCH))