def init_remote(vta_env, config):
    """Open the RPC session selected by ``vta_env.TARGET`` and ``config``.

    Parameters
    ----------
    vta_env :
        VTA environment object; only its ``TARGET`` attribute is read.
    config :
        Mapping queried with ``.get`` for ``tracker_host``, ``tracker_port``,
        ``pynq_rpc_host`` (default ``'192.168.2.99'``) and ``pynq_rpc_port``
        (default ``9091``).

    Returns
    -------
    For the ``'sim'`` and ``'tsim'`` targets: the tuple
    ``(rpc.LocalSession(), None)``.
    For every other target: the connected session on its own, after the
    runtime has been reconfigured and the FPGA programmed.

    NOTE(review): the two paths return different shapes (2-tuple vs. bare
    session) -- confirm that callers handle both.
    """
    # Simulator targets execute through a local RPC session.
    if vta_env.TARGET in ['sim', 'tsim']:
        return rpc.LocalSession(), None

    # Tracker settings; both must be truthy for the tracker to be used.
    # Setting up a tracker is described in the "Auto-tuning a convolutional
    # network for VTA" tutorial.
    tracker_host = config.get('tracker_host', None)
    tracker_port = config.get('tracker_port', None)
    # Direct-connection settings: point these at the IP/port of your board.
    device_host = config.get('pynq_rpc_host', '192.168.2.99')
    device_port = config.get('pynq_rpc_port', 9091)

    if tracker_host and tracker_port:
        session = autotvm.measure.request_remote(vta_env.TARGET,
                                                 tracker_host,
                                                 tracker_port,
                                                 timeout=10000)
    else:
        session = rpc.connect(device_host, device_port)

    # Reconfigure the JIT runtime and FPGA. Pass a bitstream path instead
    # of None to program the FPGA with a custom bitstream.
    vta.reconfig_runtime(session)
    vta.program_fpga(session, bitstream=None)
    return session
def program_rpc_bitstream(path=None):
    """Connect to the RPC server and program its FPGA.

    ``host`` and ``port`` are not parameters; they are read from the
    enclosing scope.

    Parameters
    ----------
    path : optional
        Bitstream path forwarded to ``program_fpga``. Defaults to ``None``.
    """
    # Refuse to continue when this TVM build reports RPC as disabled.
    assert tvm.runtime.enabled("rpc")
    session = rpc.connect(host, port)
    program_fpga(session, path)
def program_rpc_bitstream(path=None):
    """Connect to the RPC server and program its FPGA.

    ``host`` and ``port`` are not parameters; they are read from the
    enclosing scope.

    Parameters
    ----------
    path : optional
        Bitstream path forwarded to ``program_fpga``. Defaults to ``None``.
    """
    # Refuse to continue when this TVM build reports RPC as disabled.
    assert tvm.module.enabled("rpc")
    session = rpc.connect(host, port)
    program_fpga(session, path)
def _run(env, remote):
    """Pick the build target for ``device`` and run the GEMM benchmark.

    ``device``, ``batch``, ``in_feat`` and ``out_feat`` are not parameters;
    they are read from the enclosing scope.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_gemm``.
    """
    on_vta = device == "vta"
    if on_vta:
        target = env.target
    elif device == "arm_cpu":
        target = env.target_vta_cpu

    # Programming is skipped for the "sim" and "tsim" targets.
    if on_vta and env.TARGET not in ("sim", "tsim"):
        assert tvm.runtime.enabled("rpc")
        program_fpga(remote, bitstream=None)
        reconfig_runtime(remote)

    # load pre-tuned schedule parameters
    with autotvm.tophub.context(target):
        run_gemm(env, remote, target, batch, in_feat, out_feat)
def _run(env, remote):
    """Pick the build target for ``device`` and run every ResNet conv2d workload.

    ``device`` and ``resnet_wkls`` are not parameters; they are read from
    the enclosing scope.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_conv2d``.
    """
    on_vta = device == "vta"
    if on_vta:
        target = env.target
    elif device == "arm_cpu":
        target = env.target_vta_cpu

    # Programming is skipped for the "sim" and "tsim" targets.
    if on_vta and env.TARGET not in ("sim", "tsim"):
        assert tvm.module.enabled("rpc")
        # NOTE(review): absolute path into one user's home directory; it
        # only resolves on a machine laid out the same way.
        bitstream_path = "/home/did/tvm_did/vta/build/hardware/xilinx/vivado/pynq_1x16_i8w8a32_15_15_18_17/export/vta.bit"
        program_fpga(remote, bitstream=bitstream_path)
        reconfig_runtime(remote)

    # load pre-tuned schedule parameters
    with autotvm.tophub.context(target):
        for _, workload in resnet_wkls:
            print(workload)
            run_conv2d(env, remote, workload, target)
def _run(env, remote):
    """Pick the build target for ``device`` and run every DCGAN workload.

    ``device`` and ``dcgan_wklds`` are not parameters; they are read from
    the enclosing scope.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_conv2d_transpose``.
    """
    on_vta = device == "vta"
    if on_vta:
        target = env.target
    elif device == "arm_cpu":
        target = env.target_vta_cpu

    # Programming is skipped for the "sim" and "tsim" targets.
    if on_vta and env.TARGET not in ("sim", "tsim"):
        assert tvm.runtime.enabled("rpc")
        program_fpga(remote, bitstream=None)
        reconfig_runtime(remote)

    # load pre-tuned schedule parameters
    with autotvm.tophub.context(target):
        for _, workload in dcgan_wklds:
            print(workload)
            run_conv2d_transpose(env, remote, workload, target)
def _run(env, remote):
    """Pick the build target for ``device`` and run every ResNet conv2d workload.

    ``device`` and ``resnet_wkls`` are not parameters; they are read from
    the enclosing scope.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_conv2d``.
    """
    on_vta = device == "vta"
    if on_vta:
        target = env.target
    elif device == "arm_cpu":
        target = env.target_vta_cpu

    # Only the "sim" target skips programming in this variant.
    if on_vta and env.TARGET != "sim":
        assert tvm.module.enabled("rpc")
        program_fpga(remote, bitstream=None)
        reconfig_runtime(remote)

    # load pre-tuned schedule parameters
    with autotvm.tophub.context(target):
        for _, workload in resnet_wkls:
            print(workload)
            run_conv2d(env, remote, workload, target)
def _run(env, remote):
    """Pick the build target for ``device`` and run every workload group.

    ``device`` and ``target_workloads`` are not parameters; they are read
    from the enclosing scope. ``target_workloads`` is iterated as a
    sequence of workload collections, each yielding ``(_, workload)`` pairs.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_conv2d``.
    """
    if device == "vta":
        target = env.target
        # Programming is skipped for the "sim" and "tsim" targets.
        if env.TARGET not in ["sim", "tsim"]:
            assert tvm.runtime.enabled("rpc")
            program_fpga(remote, bitstream=None)
            reconfig_runtime(remote)
    elif device == "arm_cpu":
        target = env.target_vta_cpu
    for wkls in target_workloads:
        with autotvm.tophub.context(target):  # load pre-tuned schedule parameters
            for _, wl in wkls:
                print(wl)
                run_conv2d(env, remote, wl, target)
                # Add to avoid rpc crash
                # NOTE(review): presumably meant to pause once per workload;
                # confirm the intended nesting level of this sleep.
                time.sleep(1)
def _run(env, remote):
    """Pick the build target for ``device`` and run every ResNet conv2d workload.

    When ``device`` is ``"vta"`` and ``env.TARGET`` is neither ``"sim"`` nor
    ``"tsim"``, the FPGA is programmed first. A custom bitstream located
    under ``$TVM_HOME`` is used when that file exists; otherwise a message
    is printed and the default bitstream (``bitstream=None``) is used. An
    unset ``TVM_HOME`` is handled the same way as a missing file instead of
    failing on ``None + str``.

    ``device`` and ``resnet_wkls`` are not parameters; they are read from
    the enclosing scope.

    Parameters
    ----------
    env :
        Environment object; ``target``, ``target_vta_cpu`` and ``TARGET``
        are read from it.
    remote :
        RPC session handed to ``program_fpga``, ``reconfig_runtime`` and
        ``run_conv2d``.
    """
    if device == "vta":
        target = env.target
        if env.TARGET not in ["sim", "tsim"]:
            assert tvm.module.enabled("rpc")
            # None selects the default bitstream in program_fpga.
            custom_bitstream = None
            tvm_home = os.environ.get("TVM_HOME")
            if tvm_home is None:
                print("BITSTREAM ERROR: TVM_HOME is not set. "
                      "Will use a default bitstream...")
            else:
                candidate = (
                    tvm_home +
                    "/vta/build/hardware/xilinx/vivado/pynq_1x16_i8w8a32_15_15_18_17/export/vta.bit"
                )
                if os.path.exists(candidate):
                    custom_bitstream = candidate
                else:
                    print("BITSTREAM ERROR: file " + candidate +
                          " does not exist. Will use a default bitstream...")
            program_fpga(remote, bitstream=custom_bitstream)
            reconfig_runtime(remote)
    elif device == "arm_cpu":
        target = env.target_vta_cpu

    # load pre-tuned schedule parameters
    with autotvm.tophub.context(target):
        for _, wl in resnet_wkls:
            print(wl)
            run_conv2d(env, remote, wl, target)
def compile_model(self):
    """Build the quantized ResNet-18 graph and load it on the RPC remote.

    Sets ``self.remote``, ``self.ctx``, ``self.params`` and ``self.m``.
    Reads ``self.pynq_addr`` and ``self.id``. ``device``, ``target``,
    ``env``, ``args``, ``start_pack`` and ``stop_pack`` are not parameters;
    they are read from the enclosing/module scope.

    The elapsed build time is printed once compilation finishes.
    """
    if device == 'vta':
        self.remote = rpc.connect(self.pynq_addr, 9091)
        vta.reconfig_runtime(self.remote)
        vta.program_fpga(self.remote, bitstream=None)
    else:
        self.remote = rpc.LocalSession()

    self.ctx = self.remote.ext_dev(0) if device == 'vta' else self.remote.cpu(0)

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):

        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (env.BATCH, 3, 224, 224)}

        # NOTE(review): `ctx` below is not `self.ctx`; it has to resolve to
        # a name from the enclosing/module scope -- confirm this is intended.
        gluon_model = vision.resnet18_v1(
            pretrained=True, ctx=ctx
        ).features if args.nonsplit else splitnet.resnet18_v1_split(
            self.id + 1)

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target
        if target.device_name == 'vta':
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name != 'vta':
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)

        self.params = params

        # Measure Relay build time
        build_time = time.time() - build_start
        # The previous `f'...{0:.4f}s!'.format(build_time)` formatted the
        # literal 0 inside the f-string, so it always reported 0.0000s.
        print(f'inference graph for thread {self.id} built in {build_time:.4f}s!')

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        self.remote.upload(temp.relpath('graphlib.o'))
        lib = self.remote.load_module('graphlib.o')

        # Graph runtime
        self.m = graph_runtime.create(graph, lib, self.ctx)
# your board. device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") if not tracker_host or not tracker_port: remote = rpc.connect(device_host, int(device_port)) else: remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) # Reconfigure the JIT runtime and FPGA. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. reconfig_start = time.time() vta.reconfig_runtime(remote) vta.program_fpga(remote, bitstream=None) reconfig_time = time.time() - reconfig_start print( "Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) # In simulation mode, host the RPC server locally. else: remote = rpc.LocalSession() # Get execution context from remote ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) #################################### # Build the inference graph runtime. # ---------------------------------- # Using Darknet library load downloaded vision model and compile with Relay.
def tune_and_evaluate(tuning_opt):
    """Extract the conv2d tuning tasks of the network and print them.

    As written, the function returns right after listing the extracted
    tasks. The tuning, compilation, upload and timing code below the bare
    ``return`` only runs if that ``return`` is commented out.

    ``env``, ``target``, ``network``, ``start_pack``, ``stop_pack``,
    ``tracker_host``, ``tracker_port``, ``log_file`` and ``device`` are not
    parameters; they are read from the enclosing scope.

    Parameters
    ----------
    tuning_opt : mapping
        Keyword arguments forwarded to ``tune_tasks`` via ``**``.
    """
    if env.TARGET == "sim":
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()
    else:
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack,
                                         stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(
        mod,
        params=params,
        ops=(tvm.relay.op.nn.conv2d, ),
        target=target,
        target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for task in tasks:
        data_shape = task.args[0][1]
        kernel_shape = task.args[1][1]
        batch = data_shape[0] * data_shape[4]
        in_filter = data_shape[1] * data_shape[5]
        out_filter = kernel_shape[0] * kernel_shape[4]
        height = data_shape[2]
        width = data_shape[3]
        hkernel = kernel_shape[2]
        wkernel = kernel_shape[3]
        strides = task.args[2]
        hstride, wstride = strides[0], strides[1]
        padding = task.args[3]
        hpad, wpad = padding[0], padding[1]
        row = (batch, height, width, in_filter, out_filter, hkernel,
               wkernel, hpad, wpad, hstride, wstride)
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(*row))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name == "vta":
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)
            else:
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host)

        # Export library
        print("Upload...")
        workdir = util.tempdir()
        lib.save(workdir.relpath("graphlib.o"))
        remote.upload(workdir.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        if device == "vta":
            ctx = remote.ext_dev(0)
        else:
            ctx = remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def run_through_rpc(measure_input,
                    build_result,
                    number,
                    repeat,
                    min_repeat_ms,
                    cooldown_interval,
                    remote_args,
                    ref_input=None,
                    ref_output=None):
    """Upload a built library to an RPC remote and time it there.

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the
        generated library. If it is already a ``MeasureResult`` it is
        returned unchanged.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameters `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the
        `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness

    Returns
    -------
    MeasureResult
        Built from the measured costs (or a 1-tuple holding a
        ``RuntimeError`` when a ``TVMError`` was caught), the error number,
        the elapsed time plus ``build_result.time_cost``, and the timestamp
        taken just before the cool-down sleep.
    """
    # Nothing to measure: the builder already produced a MeasureResult.
    if isinstance(build_result, MeasureResult):
        return build_result

    started = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)

        # Program the FPGA every single time when targeting VTA
        tgt = measure_input.target
        targets_vta = hasattr(tgt, 'device_name') and tgt.device_name == 'vta'
        if targets_vta:
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime
            program_fpga(remote, None)
            reconfig_runtime(remote)

        lib_path = build_result.filename
        remote.upload(lib_path)
        func = remote.load_module(os.path.split(lib_path)[1])
        ctx = remote.context(str(measure_input.target), 0)
        time_f = func.time_evaluator(func.entry_name,
                                     ctx,
                                     number=number,
                                     repeat=repeat,
                                     min_repeat_ms=min_repeat_ms)

        # set input
        if ref_input:
            args = [nd.array(ref, ctx=ctx) for ref in ref_input]
        else:
            # create empty arrays on the remote device and copy them once.
            # This can avoid some memory issues that make the measurement
            # results unreliable.
            blanks = [
                nd.empty(info[0], dtype=info[1], ctx=ctx)
                for info in build_result.arg_info
            ]
            args = [nd.array(blank, ctx=ctx) for blank in blanks]
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
        remote.remove('')

        # remove largest and smallest value to reduce variance
        if len(costs) > 2:
            costs = tuple(sorted(costs)[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                if np.allclose(expected, real.asnumpy(), rtol=1e-4):
                    continue
                logger.warning("Wrong Answer!")
                errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        # Cut the message at the first occurrence of each marker, in order.
        for marker in ("Stack trace returned", "CUDA Source"):
            if marker in msg:
                msg = msg[:msg.index(marker)]
        costs = (RuntimeError(msg[:1024]), )
        errno = MeasureErrorNo.RUNTIME_DEVICE

    tstamp = time.time()
    time.sleep(cooldown_interval)
    elapsed = tstamp - started + build_result.time_cost
    return MeasureResult(costs, errno, elapsed, tstamp)
# We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET == "pynq": # Make sure that TVM was compiled with RPC=1 assert tvm.module.enabled("rpc") remote = rpc.connect(host, port) # Reconfigure the JIT runtime vta.reconfig_runtime(remote) # Program the FPGA with a pre-compiled VTA bitstream. # You can program the FPGA with your own custom bitstream # by passing the path to the bitstream file instead of None. vta.program_fpga(remote, bitstream=None) # In simulation mode, host the RPC server locally. elif env.TARGET == "sim": remote = rpc.LocalSession() ###################################################################### # Computation Declaration # ----------------------- # In this example we describe a simple matrix multiplication addition, which # requires multiple computation stages, as shown in the dataflow diagram below. # First we describe the input tensors :code:`A` and :code:`B` that are living # in main memory. # Second, we need to declare intermediate tensors :code:`A_buf` and # :code:`B_buf`, which will live in VTA's on-chip buffers. # Having this extra computational stage allows us to explicitly
def run_through_rpc(
    measure_input,
    build_result,
    number,
    repeat,
    min_repeat_ms,
    cooldown_interval,
    remote_args,
    ref_input=None,
    ref_output=None,
    enable_cpu_cache_flush=False,
):
    """Upload a built library to an RPC remote and time it there.

    Parameters
    ----------
    measure_input: MeasureInput
        The raw measure input
    build_result: BuildResult
        The result returned from Builder. This contains the path to the
        generated library. If it is already a ``MeasureResult`` it is
        returned unchanged.
    number: int
        The number of times to run the generated code for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int, optional
        The number of times to repeat the measurement.
        In total, the generated code will be run (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter
        is set, the parameters `number` will be dynamically adjusted to meet
        the minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the
        `number` parameter will be automatically increased.
    cooldown_interval: float
        The cool down interval between two measurements
    remote_args: Tuple
        The argument for request_remote
    ref_input: List of np.ndarray
        The reference input used for checking correctness
    ref_output: List of np.ndarray
        The reference output used for checking correctness
    enable_cpu_cache_flush: bool
        Whether to flush cache on CPU between repeated measurements.
        Flushing cache can make the measured latency of one operator closer
        to its actual latency during end-to-end inference.
        To make this option effective, the argument `number` should also be
        set to 1. This only has an effect on CPU tasks.

    Returns
    -------
    MeasureResult
        Built from the measured costs (or a 1-tuple holding a
        ``RuntimeError`` when a ``TVMError`` was caught), the error number,
        the elapsed time plus ``build_result.time_cost``, and the timestamp
        taken just before the cool-down sleep.

    Raises
    ------
    AttributeError
        When no reference input is given and looking up
        ``tvm.contrib.random.random_fill`` on the remote raises
        ``AttributeError``.
    """
    # Nothing to measure: the builder already produced a MeasureResult.
    if isinstance(build_result, MeasureResult):
        return build_result

    begin = time.time()
    errno = MeasureErrorNo.NO_ERROR
    try:
        # upload built module
        remote = request_remote(*remote_args)

        # Program the FPGA every single time when targeting VTA
        tgt = measure_input.target
        if hasattr(tgt, "device_name") and tgt.device_name == "vta":
            # pylint: disable=import-outside-toplevel
            from vta import program_fpga, reconfig_runtime

            program_fpga(remote, None)
            reconfig_runtime(remote)

        remote.upload(build_result.filename)
        func = remote.load_module(os.path.split(build_result.filename)[1])
        ctx = remote.context(str(measure_input.target), 0)

        # Limitation:
        # We can not get PackFunction directly in the remote mode as it is wrapped
        # under the std::function. We could lift the restriction later once we fold
        # the PackedFunc as an object. Currently, we pass function name to work
        # around it.
        if enable_cpu_cache_flush:
            f_prepare = "cache_flush_cpu_non_first_arg"
        else:
            f_prepare = ""
        time_f = func.time_evaluator(
            func.entry_name,
            ctx,
            number=number,
            repeat=repeat,
            min_repeat_ms=min_repeat_ms,
            f_preproc=f_prepare,
        )

        # set input
        if ref_input:
            args = [nd.array(ref, ctx=ctx) for ref in ref_input]
        else:
            try:
                random_fill = remote.get_function("tvm.contrib.random.random_fill")
            except AttributeError:
                raise AttributeError(
                    "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices"
                )
            args = [
                nd.empty(info[0], dtype=info[1], ctx=ctx)
                for info in build_result.arg_info
            ]
            for buf in args:
                random_fill(buf)
            ctx.sync()

        costs = time_f(*args).results

        # clean up remote files
        remote.remove(build_result.filename)
        remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
        remote.remove("")

        # remove largest and smallest value to reduce variance
        if len(costs) > 2:
            ordered = sorted(costs)
            costs = tuple(ordered[1:-1])

        # check correctness of output
        if ref_output:
            for expected, real in zip(ref_output, args):
                matches = np.allclose(expected, real.asnumpy(), rtol=1e-4)
                if not matches:
                    logger.warning("Wrong Answer!")
                    errno = MeasureErrorNo.WRONG_ANSWER
    except TVMError as exc:
        msg = str(exc)
        # Cut the message at the first occurrence of each marker, in order.
        for marker in ("Stack trace returned", "CUDA Source"):
            if marker in msg:
                msg = msg[: msg.index(marker)]
        costs = (RuntimeError(msg[:1024]),)
        errno = MeasureErrorNo.RUNTIME_DEVICE

    tstamp = time.time()
    time.sleep(cooldown_interval)
    total_time = tstamp - begin + build_result.time_cost
    return MeasureResult(costs, errno, total_time, tstamp)
# We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the config.json file. if env.TARGET == "pynq": # try init if os.environ.get('INIT_PYNQ', '').lower() == 'yes': print('') print('Initializing board ...') from tvm import rpc from vta import get_bitstream_path, download_bitstream, program_fpga, reconfig_runtime assert tvm.module.enabled("rpc") remote = rpc.connect(host, port) program_fpga(remote, None) # None -> path # remote = rpc.connect(host, port) reconfig_runtime(remote) print('') # Make sure that TVM was compiled with RPC=1 assert tvm.module.enabled("rpc") remote = rpc.connect(host, port) dt = time.time() # Reconfigure the JIT runtime vta.reconfig_runtime(remote) # Program the FPGA with a pre-compiled VTA bitstream.