def adaptive_evaluator(epsilon, remote, build_result, measure_input, ref_input, number, repeat, min_repeat_ms): # print("####in adaptive evaluator###") func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) flop = measure_input.task.flop # set input if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: # create empty arrays on the remote device and copy them once. # This can avoid some memory issues that make the measurement results unreliable. args = [ nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info ] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() # break the number*repeat into several batch # print("number=%d, repeat=%d" % (number, repeat)) if repeat * number < 300: # no need to do adaptive evaluator time_f = func.time_evaluator(func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) eva_res = time_f(*args) costs = eva_res.results else: b_size = 50 costs = [] sum_num = 0 max_iter = number * repeat pis = [] bi = 1 rep = 0 flag = True while flag and sum_num < max_iter: time_f = func.time_evaluator(func.entry_name, ctx, number=b_size, repeat=1, min_repeat_ms=min_repeat_ms) b_mean = time_f(*args).mean # print("b_mean:" + str(b_mean)) costs.append(b_mean) sum_num = sum_num + b_size # calculate the flops of per batch: flop/time_mean_batch_i pi = flop / b_mean pis.append(pi) pis_array = np.array(pis) if len(pis_array) > 4: # remove the min and max to reduce variance pis_array.sort() pis_array = pis_array[1:-1] # calculate the coefficient of variation cv = pis_array.std() / pis_array.mean() if bi > 2 and cv < epsilon: # print("\nindex is %d, break at batch#%d, cv=%.10f, cost is %.8f." % (measure_input.config.index, bi, cv, b_mean)) flag = False bi = bi + 1 return costs
def run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): """Run a generated library through rpc Parameters ---------- measure_input: MeasureInput The raw measure input build_result: BuildResult The result returned from Builder. This contains the path to the generated library. number: int The number of times to run the generated code for taking average. We call these runs as one `repeat` of measurement. repeat : int, optional The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, where the first one is warm up and will be discarded. The returned result contains `repeat` costs, each of which is an average of `number` costs. min_repeat_ms: int, optional The minimum duration of one `repeat` in milliseconds. By default, one `repeat` contains `number` runs. If this parameter is set, the parameters `number` will be dynamically adjusted to meet the minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. cooldown_interval: float The cool down interval between two measurements remote_args: Tuple The argument for request_remote ref_input: List of np.ndarray The reference input used for checking correctness ref_output: List of np.ndarray The reference output used for checking correctness """ if isinstance(build_result, MeasureResult): return build_result tic = time.time() errno = MeasureErrorNo.NO_ERROR try: # upload built module remote = request_remote(*remote_args) # Program the FPGA every single time when targeting VTA if hasattr(measure_input.target, 'device_name') and \ measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel from vta import program_fpga, reconfig_runtime program_fpga(remote, None) reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) time_f = func.time_evaluator(func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) # set input if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: # create empty arrays on the remote device and copy them once. # This can avoid some memory issues that make the measurement results unreliable. args = [ nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info ] args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() costs = time_f(*args).results # clean up remote files remote.remove(build_result.filename) remote.remove(os.path.splitext(build_result.filename)[0] + '.so') remote.remove('') if len(costs ) > 2: # remove largest and smallest value to reduce variance costs = list(costs) costs.sort() costs = tuple(costs[1:-1]) # check correctness of output if ref_output: for expected, real in zip(ref_output, args): if not np.allclose(expected, real.asnumpy(), rtol=1e-4): logger.warning("Wrong Answer!") errno = MeasureErrorNo.WRONG_ANSWER except TVMError as exc: msg = str(exc) if "Stack trace returned" in msg: msg = msg[:msg.index("Stack trace returned")] if "CUDA Source" in msg: msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]), ) errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def run_through_rpc( measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None, enable_cpu_cache_flush=False, ): """Run a generated library through rpc Parameters ---------- measure_input: MeasureInput The raw measure input build_result: BuildResult The result returned from Builder. This contains the path to the generated library. number: int The number of times to run the generated code for taking average. We call these runs as one `repeat` of measurement. repeat : int, optional The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, where the first one is warm up and will be discarded. The returned result contains `repeat` costs, each of which is an average of `number` costs. min_repeat_ms: int, optional The minimum duration of one `repeat` in milliseconds. By default, one `repeat` contains `number` runs. If this parameter is set, the parameters `number` will be dynamically adjusted to meet the minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. cooldown_interval: float The cool down interval between two measurements remote_args: Tuple The argument for request_remote ref_input: List of np.ndarray The reference input used for checking correctness ref_output: List of np.ndarray The reference output used for checking correctness enable_cpu_cache_flush: bool Whether to flush cache on CPU between repeated measurements. Flushing cache can make the measured latency of one operator closer to its actual latency during end-to-end inference. To make this option effective, the argument `number` should also be set to 1. This is only has effect on CPU task. """ if isinstance(build_result, MeasureResult): return build_result tic = time.time() errno = MeasureErrorNo.NO_ERROR try: # upload built module remote = request_remote(*remote_args) # Program the FPGA every single time when targeting VTA if ( hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta" ): # pylint: disable=import-outside-toplevel from vta import program_fpga, reconfig_runtime program_fpga(remote, None) reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold # the PackedFunc as an object. Currently, we pass function name to work # around it. f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = func.time_evaluator( func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, f_preproc=f_prepare, ) # set input if ref_input: args = [nd.array(x, ctx=ctx) for x in ref_input] else: try: random_fill = remote.get_function("tvm.contrib.random.random_fill") except AttributeError: raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] for arg in args: random_fill(arg) ctx.sync() costs = time_f(*args).results # clean up remote files remote.remove(build_result.filename) remote.remove(os.path.splitext(build_result.filename)[0] + ".so") remote.remove("") if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) costs.sort() costs = tuple(costs[1:-1]) # check correctness of output if ref_output: for expected, real in zip(ref_output, args): if not np.allclose(expected, real.asnumpy(), rtol=1e-4): logger.warning("Wrong Answer!") errno = MeasureErrorNo.WRONG_ANSWER except TVMError as exc: msg = str(exc) if "Stack trace returned" in msg: msg = msg[: msg.index("Stack trace returned")] if "CUDA Source" in msg: msg = msg[: msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]),) errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
def run_through_rpc( measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_kwargs, ref_input, enable_cpu_cache_flush=False, module_loader=None, ): """Run a generated library through rpc Parameters ---------- measure_input: MeasureInput The raw measure input build_result: BuildResult The result returned from Builder. This contains the path to the generated library. number: int The number of times to run the generated code for taking average. We call these runs as one `repeat` of measurement. repeat : int, optional The number of times to repeat the measurement. In total, the generated code will be run (1 + number x repeat) times, where the first one is warm up and will be discarded. The returned result contains `repeat` costs, each of which is an average of `number` costs. min_repeat_ms: int, optional The minimum duration of one `repeat` in milliseconds. By default, one `repeat` contains `number` runs. If this parameter is set, the parameters `number` will be dynamically adjusted to meet the minimum duration requirement of one `repeat`. i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. cooldown_interval: float The cool down interval between two measurements remote_kwargs: dict Passed to module_loader(). Ultimately, keyword args to request_remote(). ref_input: List of np.ndarray The reference input used for tuning. Empty for randomly filled input. enable_cpu_cache_flush: bool Whether to flush cache on CPU between repeated measurements. Flushing cache can make the measured latency of one operator closer to its actual latency during end-to-end inference. To make this option effective, the argument `number` should also be set to 1. This is only has effect on CPU task. module_loader: ModuleLoader A function that returns a ContextManager used to establish and teardown the remote session. """ if isinstance(build_result, MeasureResult): return build_result tic = time.time() errno = MeasureErrorNo.NO_ERROR try: # upload built module with module_loader(remote_kwargs, build_result) as (remote, mod): dev = remote.device(str(measure_input.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold # the PackedFunc as an object. Currently, we pass function name to work # around it. f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = mod.time_evaluator( mod.entry_name, dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, f_preproc=f_prepare, ) if ref_input: args = [nd.array(x, device=dev) for x in ref_input] else: try: random_fill = remote.get_function( "tvm.contrib.random.random_fill") except AttributeError: raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices") args = [ nd.empty(x[0], x[1], dev) for x in build_result.arg_info ] if "scatter" not in measure_input.task.name: # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) dev.sync() costs = time_f(*args).results if len(costs ) > 2: # remove largest and smallest value to reduce variance costs = list(costs) costs.sort() costs = tuple(costs[1:-1]) except TVMError as exc: msg = str(exc) if "Stack trace returned" in msg: msg = msg[:msg.index("Stack trace returned")] if "CUDA Source" in msg: msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]), ) errno = MeasureErrorNo.RUNTIME_DEVICE tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)