])
with tvm.transform.PassContext(opt_level=3):
    mod = seq(mod)

tvm_target = get_tvm_target(device, get_device_type(), get_device_arch(),
                            get_device_attributes())
tvm_targets = tvm.target.Target(tvm_target)
cpu_target = "llvm"
target_host = cpu_target
cpudevice = tvm.runtime.cpu()

if logfile is not None:
    with autotvm.apply_history_best(logfile):
        with tvm.transform.PassContext(opt_level=3):
            graph_mod = relay.build(mod, tvm_targets, params=params,
                                    target_host=target_host)
else:
    with tvm.transform.PassContext(opt_level=3):
        graph_mod = relay.build(mod, tvm_targets, params=params,
                                target_host=target_host)

lib = graph_mod.get_lib()
params = graph_mod.get_params()
graph = graph_mod.get_json()
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, data_shape, out_shape = get_network(model_name, batch_size)
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape={'data': data_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': data_shape},
                params=params, dtype=dtype)

        # upload parameters to device
        ctx = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            target_host=target_host,
                                            shape={'data': input_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, target_host=target_host,
                shape={'data': input_shape}, params=params, dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    print(net)
    input_name = 'Input_0' if network == 'onnx' else 'data'
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape={input_name: input_shape},
                                            dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    # with autotvm.apply_history_best('onnx.log'):
    with autotvm.apply_history_best('gtx-1060.log'):
        # if True:
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={input_name: input_shape},
                params=params, dtype=dtype)

        # export library
        # tmp = tempdir()
        # filename = "net.tar"
        # lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context('cuda', 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input(input_name, data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(net, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                net, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
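# Several snippets above call a `tune_tasks` helper without defining it. Below
# is a minimal sketch of such a helper, modeled on the standard TVM autotvm
# tutorials; the defaults and the `pick_best` post-processing step are
# assumptions drawn from those tutorials, not from this document.
import os

from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner


def tune_tasks(tasks, measure_option, tuner='xgb', n_trial=1000,
               early_stopping=None, log_filename='tuning.log'):
    tmp_log_file = log_filename + ".tmp"
    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        # create a tuner for this task
        if tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)
        # run the search, appending records to a temporary log
        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)
                       ])
    # keep only the best record per workload
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)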
def compile_model(
    path,
    target,
    dump_code=None,
    target_host=None,
    model_format=None,
    tuning_records=None,
    alter_layout=None,
    shape_dict=None,
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph runtime.

    Parameters
    ----------
    path: str
        Path to a file.
    target : str
        The target for which to compile. Can be a plain string or a path.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code needs to be generated.
    model_format: str, optional
        A string representing the name of a frontend to be used.
    tuning_records: str, optional
        Path to the file produced by the tuning to be used during compilation.
    alter_layout: str, optional
        The layout to convert the graph to. Note, the convert layout pass
        doesn't currently guarantee the whole of the graph will be converted
        to the chosen layout.
    shape_dict: dict, optional
        A mapping from input names to their shape. When present, the default
        shapes in the model will be overwritten.

    Returns
    -------
    graph : str
        A JSON-serialized TVM execution graph.
    lib : tvm.module.Module
        A TVM module containing the compiled functions.
    params : dict
        The parameters (weights) for the TVM module.
    dumps : dict
        Dictionary containing the dumps specified.
    """
    dump_code = [x.strip() for x in dump_code.split(",")] if dump_code else None
    mod, params = frontends.load_model(path, model_format, shape_dict)
    config = {}

    if alter_layout:
        mod = common.convert_graph_layout(mod, alter_layout)

    tvm_target, extra_targets = common.target_from_cli(target)
    target_host = tvm_target if not target_host else target_host

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params)
        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(opt_level=3, config=config):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = relay.build(mod, tvm_target, params=params,
                                               target_host=target_host)
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=3, config=config):
                    logger.debug("building relay graph with tuning records")
                    graph_module = relay.build(mod, tvm_target, params=params,
                                               target_host=target_host)
    else:
        with tvm.transform.PassContext(opt_level=3, config=config):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = relay.build(mod, tvm_target, params=params,
                                       target_host=target_host)

    # Generate output dump files with sources
    dump_code = dump_code or []
    dumps = {}
    for source_type in dump_code:
        lib = graph_module.get_lib()
        # TODO lib.get_source calls have inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(source_type)
        dumps[source_type] = source

    # TODO we need to update this return to use the updated graph module APIs,
    #      as these getter functions will be deprecated in the next release
    #      (@leandron)
    return (graph_module.get_json(), graph_module.get_lib(),
            graph_module.get_params(), dumps)
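# A hedged usage sketch for the compile_model variant above: the model file,
# target string, tuning-records file, and input name/shape below are
# illustrative assumptions, not values from the source.
graph_json, lib, params, dumps = compile_model(
    "resnet50-v2-7.onnx",                      # hypothetical model path
    target="llvm",
    dump_code="relay",
    tuning_records="resnet50-autotuning.log",  # hypothetical records file
    shape_dict={"data": (1, 3, 224, 224)},
)

# persist the three build artifacts for deployment
with open("model.json", "w") as f:
    f.write(graph_json)
lib.export_library("model.so")
with open("model.params", "wb") as f:
    f.write(relay.save_param_dict(params))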
# In practice, running 1000 trials can usually find some good kernels
# for this template.

# logging config (for printing tuning log to screen)
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# the last layer in resnet
task = autotvm.task.create(conv2d_no_batching,
                           args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
                           target='cuda')
print(task.config_space)

# use local gpu, measure 10 times for every config to reduce variance
# run 8 parallel threads for compilation
measure_option = autotvm.measure_option(mode='local',
                                        number=10,
                                        parallel_num=8,
                                        timeout=20)

# begin tuning, log records to file `cache.tsv`
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])

# get best config from cache file
dispatch_context = autotvm.apply_history_best("cache.tsv")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)
def tune_and_evaluate(tuning_opt, layer_name='qconv_2', input_layout='nchw'):
    # extract workloads from relay program
    global output_file
    print("Extract tasks...")
    if input_layout == 'nchw':
        mod, params, input_shape = models.get_bitserial_conv2d_nchw(
            models.vgg16, layer_name,
            activation_bits=activation_bits, weight_bits=weight_bits)
    else:
        mod, params, input_shape = models.get_bitserial_conv2d_nhwc(
            models.vgg16, layer_name,
            activation_bits=activation_bits, weight_bits=weight_bits)
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params,
        ops=(relay.op.get("nn.bitserial_conv2d"),))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    log_file = tuning_opt['log_filename']
    print('Extract the best from %s' % log_file)
    specific_layer = log_file.split('.')[0]

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.ones(input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
        output_file.write(specific_layer + ',' + str(np.mean(prof_res)) + ',' +
                          str(np.std(prof_res)) + '\n')
        last_layer = net.layers[net.n - 1]
        tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes,
                                                     nms_thresh)

        tic = time.time()
        # tvm.relay.testing.yolo_detection.draw_detections(
        #     font_path, img, dets, thresh, names, last_layer.classes)
        img = img.transpose(1, 2, 0)
        img = darwBbox(dets, img, thresh, names)
        img = np.flip(img, 2)
        tac = time.time()

        cv2.imshow('DarkNet', img)
        print(tac - tic, time.time() - tac)
        res, frame = cap.read()
        cv2.waitKey(1)

        cnt += 1
        if cnt % steps == 0:
            end = time.time()
            print(steps * 1. / (end - start))
            start = end

    cv2.destroyAllWindows()
    cap.release()


if __name__ == '__main__':
    if LOG_FILE is None:
        show()
    else:
        with autotvm.apply_history_best(LOG_FILE):
            show()
def test_conv2d_nchw():
    # load tophub
    ctx = autotvm.apply_history_best([])
    for device in get_all_backend():
        context = autotvm.tophub.context(device)
        context.__enter__()

    # ResNet18 workloads
    verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
    verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0)
    verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1)
    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)
    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
    verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1)
    verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0)
    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
    verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1)
    verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0)
    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)

    # bias, relu
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)

    # dilation = 2
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)

    # batch size
    verify_conv2d_nchw(4, 64, 56, 64, 3, 1, 1)
    verify_conv2d_nchw(9, 64, 56, 64, 3, 1, 1)

    # weird workloads
    verify_conv2d_nchw(2, 2, 2, 2, 2, 2, 2)
    verify_conv2d_nchw(3, 3, 3, 3, 3, 3, 3)
    verify_conv2d_nchw(4, 4, 4, 4, 4, 4, 4)
    verify_conv2d_nchw(5, 5, 5, 5, 5, 5, 5)
    verify_conv2d_nchw(6, 6, 6, 6, 6, 6, 6)

    # disable these tests due to some bugs of llvm with nvptx
    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
    # verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)

    # inception v3 workloads
    verify_conv2d_nchw(1, 3, 299, 32, 3, 2, 0)
    verify_conv2d_nchw(1, 32, 149, 32, 3, 1, 0)
    verify_conv2d_nchw(1, 32, 147, 64, 3, 1, 1)
    verify_conv2d_nchw(1, 64, 73, 80, 1, 1, 0)
    verify_conv2d_nchw(1, 80, 73, 192, 3, 1, 0)
    verify_conv2d_nchw(1, 192, 35, 64, 1, 1, 0)
    verify_conv2d_nchw(1, 192, 35, 48, 1, 1, 0)
    verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2)
    verify_conv2d_nchw(1, 64, 35, 96, 3, 1, 1)
    verify_conv2d_nchw(1, 96, 35, 96, 3, 1, 1)
    verify_conv2d_nchw(1, 192, 35, 32, 1, 1, 0)
    verify_conv2d_nchw(1, 256, 35, 64, 1, 1, 0)
    verify_conv2d_nchw(1, 256, 35, 48, 1, 1, 0)
    verify_conv2d_nchw(1, 288, 35, 64, 1, 1, 0)
    verify_conv2d_nchw(1, 288, 35, 48, 1, 1, 0)
    verify_conv2d_nchw(1, 288, 35, 384, 3, 2, 0)
    verify_conv2d_nchw(1, 96, 35, 96, 3, 2, 0)
    verify_conv2d_nchw(1, 768, 17, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 768, 17, 128, 1, 1, 0)
    verify_conv2d_nchw(1, 128, 17, 128, 1, 1, 0)
    verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3)
    verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3)
    verify_conv2d_nchw(1, 128, 17, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 768, 17, 160, 1, 1, 0)
    verify_conv2d_nchw(1, 160, 17, 160, 1, 1, 0)
    verify_conv2d_nchw(1, 160, 17, 192, 7, 1, 3)
    verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3)
    verify_conv2d_nchw(1, 160, 17, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 192, 17, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 192, 17, 192, 7, 1, 3)
    verify_conv2d_nchw(1, 192, 17, 320, 3, 2, 0)
    verify_conv2d_nchw(1, 192, 17, 192, 3, 2, 0)
    verify_conv2d_nchw(1, 1280, 8, 320, 1, 1, 0)
    verify_conv2d_nchw(1, 1280, 8, 384, 1, 1, 0)
    verify_conv2d_nchw(1, 384, 8, 384, 1, 1, 0)
    verify_conv2d_nchw(1, 384, 8, 384, 3, 1, 1)
    verify_conv2d_nchw(1, 1280, 8, 448, 1, 1, 0)
    verify_conv2d_nchw(1, 448, 8, 384, 3, 1, 1)
    verify_conv2d_nchw(1, 1280, 8, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 2048, 8, 320, 1, 1, 0)
    verify_conv2d_nchw(1, 2048, 8, 384, 1, 1, 0)
    verify_conv2d_nchw(1, 2048, 8, 448, 1, 1, 0)
    verify_conv2d_nchw(1, 2048, 8, 192, 1, 1, 0)
    verify_conv2d_nchw(1, 1024, 19, 84, 3, 1, 1)
    verify_conv2d_nchw(1, 2048, 10, 126, 3, 1, 1)
    verify_conv2d_nchw(1, 512, 5, 126, 3, 1, 1)
    verify_conv2d_nchw(1, 256, 3, 126, 3, 1, 1)
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, log_filename, measure_option, tuner, early_stopping):
    # N, H, W, CO, CI, KH, KW, strides, padding, dilation = \
    #     1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    # data = deserialize_args(('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')))
    # kernel = deserialize_args(('TENSOR', tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')))
    origin_layout = 'NCHW'
    func_create = 'topi_x86_conv2d_NCHW_test'

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2',
                               template_key='direct')
    # task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    tuner.tune(n_trial=trials,
               measure_option=measure_option,
               callbacks=[
                   autotvm.callback.progress_bar(trials),
                   autotvm.callback.log_to_file(log_filename)
               ])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=core-avx2"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s, arg_bufs, "llvm -mcpu=core-avx2", name="fconv")
    print("arg_bufs 0", arg_bufs[0])
    print("arg_bufs 1", arg_bufs[1])
    print("arg_bufs 2", arg_bufs[2])
    # print(func.get_source())

    dump = "%s.ll" % log_filename
    f = open(dump, "a")
    f.write(func.get_source())
    f.close()
def run(name, N, H, W, factor, CI, KH, KW, strides, padding, dilation):
    # s, arg_bufs = depthwise_conv2d_nchw(N, H, W, factor, CI, KH, KW, strides, padding, dilation)
    task = autotvm.task.create(depthwise_conv2d_nchw,
                               args=(N, H, W, factor, CI, KH, KW, strides,
                                     padding, dilation),
                               target='cuda')
    print(task.config_space)
    logfile = "depthwise_" + name + ".log"

    # Use local gpu, measure 10 times for every config to reduce variance
    # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))

    # Begin tuning, log records to the log file.
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(logfile)])

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create("cuda"):
            s, arg_bufs = depthwise_conv2d_nchw(N, H, W, factor, CI, KH, KW,
                                                strides, padding, dilation)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CI, factor, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    # `strides` and `padding` are assumed to be ints here; the original used
    # the undefined names `stride` and `pad`.
    c_tvm = tvm.nd.empty((N, factor * CI,
                          (H + 2 * padding - KH) // strides + 1,
                          (W + 2 * padding - KW) // strides + 1), ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)
    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we choose a larger repeat number to reduce the
    # noise and the overhead of kernel launch. You can also use nvprof to
    # validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    print('Time cost of this operator: %f' % cost)
    with open("autotvm_conv_nchw.txt", "a") as f:
        f.write("{}, {}\n".format(name, cost))
def __init__(self, config):
    cfg_path = config['cfg_path']
    weights_path = config['weights_path']
    device_type = config['device_type']
    autotune = config['autotune']
    log_file = config['log_file']
    self.thresh = config['thresh']
    self.nms_thresh = config['nms_thresh']

    DARKNET_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/lib/libdarknet2.0.so?raw=true'
    lib_path = download_testdata(DARKNET_URL, 'libdarknet2.0.so', module="darknet")
    DARKNET_LIB = __darknetffi__.dlopen(lib_path)
    self.net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'),
                                        weights_path.encode('utf-8'), 0)
    dtype = 'float32'
    data = np.empty([1, self.net.c, self.net.h, self.net.w], dtype)
    self.shape = {'data': data.shape}

    # convert darknet to relay functions
    mod, params = relay.frontend.from_darknet(self.net, dtype=dtype,
                                              shape=data.shape)

    # import graph to relay; the device branches differ only in target and
    # context, so select them first and build once
    if device_type == 'cpu':
        target = 'llvm'
        ctx = tvm.cpu(0)
    elif device_type == 'cuda-cudnn':
        target = 'cuda -libs=cudnn'
        ctx = tvm.gpu()
    elif device_type == 'cuda':
        target = tvm.target.cuda()
        ctx = tvm.gpu()
    else:
        err = "Device type is not supported on this platform."
        raise NotImplementedError(err)

    if autotune:
        if not os.path.isfile(log_file):
            err = "Autotvm log file does not exist."
            raise NotImplementedError(err)
        with autotvm.apply_history_best(log_file):
            with relay.build_config(opt_level=3):
                graph, lib, self.params = relay.build_module.build(
                    mod, target=target, params=params)
    else:
        with relay.build_config(opt_level=3):
            graph, lib, self.params = relay.build_module.build(
                mod, target=target, params=params)

    self.m = graph_runtime.create(graph, lib, ctx)
def tune_kernels(args, M, N, P, K, trials, measure_option, tuner, early_stopping):
    feature_type = args.feature
    print('Feature:', feature_type)
    count = args.num_iters
    likwid_event = args.likwid_event
    random = args.random
    sa_n_iter = args.sa_num_iters
    save_features = not args.no_save_features

    task = autotvm.task.create("template/tc",
                               args=(M, N, P, K, tc_index, 'float32'),
                               target='llvm -mcpu=core-avx2')
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    for i in range(count):
        if args.key_id is not None and count == 1:
            save_ind = int(args.key_id)
        else:
            save_ind = i
        if random:
            log_filename = 'tc%i_%i_%i_%s_%icore_rand.log' % (
                tc_index, N, save_ind, feature_type, num_threads)
        else:
            log_filename = 'tc%i_%i_%i_%s_%icore.log' % (
                tc_index, N, save_ind, feature_type, num_threads)

        if likwid_event is not None:
            if random:
                pickle_file = 'data/tc/likwid_rand_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (
                    tc_index, N, feature_type, num_threads, trials, save_ind)
            else:
                pickle_file = 'data/tc/likwid_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (
                    tc_index, N, feature_type, num_threads, trials, save_ind)
        else:
            if random:
                pickle_file = 'data/tc/rand_tc%i_%i_%s_features_%icore_%i_%i.pkl' % (
                    tc_index, N, feature_type, num_threads, trials, save_ind)
            else:
                pickle_file = 'data/tc/tc%i_%i_new_%s_features_%icore_%i_%i.pkl' % (
                    tc_index, N, feature_type, num_threads, trials, save_ind)

        if os.path.exists(pickle_file):
            print('File exists', pickle_file)
            continue

        tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type,
                                       loss_type='rank', plan_size=80,
                                       sa_n_iter=sa_n_iter)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event,
                   save_features=save_features,
                   random=random)

        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)
        # print(tvm.lower(s, arg_bufs, simple_mode=True))

        if save_features:
            with open(pickle_file, 'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
        try:
            os.remove(log_filename)
        except OSError:
            pass
def compile_model(
    tvmc_model: TVMCModel,
    target: str,
    opt_level: int = 3,
    executor: Optional[Executor] = Executor("graph"),
    runtime: Optional[Runtime] = Runtime("cpp"),
    tuning_records: Optional[str] = None,
    package_path: Optional[str] = None,
    cross: Optional[Union[str, Callable]] = None,
    cross_options: Optional[str] = None,
    output_format: str = "so",
    dump_code: Optional[List[str]] = None,
    target_host: Optional[str] = None,
    desired_layout: Optional[str] = None,
    disabled_pass: Optional[str] = None,
    pass_context_configs: Optional[List[str]] = None,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
    use_vm: bool = False,
    mod_name: Optional[str] = "default",
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph executor.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model object that should be compiled.
    target : str
        The target for which to compile. Can be a plain string or a path.
    opt_level : int
        The option that controls various sorts of optimizations.
    executor : Optional[Executor]
        The executor to compile for; defaults to the graph executor.
    runtime : Optional[Runtime]
        The runtime configuration to compile for; defaults to the C++ runtime.
    tuning_records : str
        A path to tuning records produced using tvmc.tune. When provided,
        compilation will use more optimized kernels leading to better results.
    package_path : str, optional
        The path to export the compiled model to. If not provided it will
        be saved in a temporary directory.
    cross : str or callable object, optional
        Function that performs the actual compilation.
    cross_options : str, optional
        Command line options to be passed to the cross compiler.
    output_format : str
        What format to use when saving the function library. Must be one of
        "so" or "tar". When compiling for a remote device without a cross
        compiler, "tar" will likely work better.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code needs to be generated.
    desired_layout: str, optional
        The layout to convert the graph to. Note, the convert layout pass
        doesn't currently guarantee the whole of the graph will be converted
        to the chosen layout.
    disabled_pass: str, optional
        Comma-separated list of passes which needs to be disabled
        during compilation.
    pass_context_configs: list[str], optional
        List of strings containing a set of configurations to be passed to
        the PassContext.
    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial
        Target arguments.
    use_vm: bool
        Whether to use the VM to compile the model as opposed to the
        graph executor.
    mod_name: str, optional
        The module name.

    Returns
    -------
    compiled_model : TVMCPackage
        The compiled TVMCModel ready to be run.
    """
    mod, params = tvmc_model.mod, tvmc_model.params

    config = parse_configs(pass_context_configs)

    if desired_layout:
        mod = convert_graph_layout(mod, desired_layout)

    tvm_target, extra_targets = target_from_cli(target, additional_target_options)
    tvm_target, target_host = Target.check_and_update_host_consist(tvm_target,
                                                                   target_host)

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]

        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]
        with tvm.transform.PassContext(config=config):
            mod = partition_function(mod, params, mod_name=mod_name,
                                     **codegen_from_cli["opts"])

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(opt_level=opt_level, config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(opt_level=opt_level, config=config,
                                               disabled_pass=disabled_pass):
                    logger.debug("building relay graph with tuning records")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
    else:
        with tvm.transform.PassContext(opt_level=opt_level, config=config,
                                       disabled_pass=disabled_pass):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = build(
                mod,
                tvm_target=tvm_target,
                executor=executor,
                runtime=runtime,
                params=params,
                use_vm=use_vm,
                mod_name=mod_name,
            )

    # Generate output dump files with sources
    if dump_code is None:
        dump_code = []
    if not isinstance(dump_code, list):
        dump_code = [dump_code]
    dumps = {}
    for source_type in dump_code:
        if use_vm:
            lib = graph_module.lib
        else:
            lib = graph_module.get_lib()
        # TODO lib.get_source calls have inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(source_type)
        dumps[source_type] = source

    # Create a new tvmc model package object from the graph definition.
    package_path = tvmc_model.export_package(graph_module, package_path, cross,
                                             cross_options, output_format)

    # Write dumps to file.
    if dumps:
        save_dumps(package_path, dumps)

    return TVMCPackage(package_path)
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, key, measure_option, tuner, early_stopping):
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    origin_layout = 'NCHW'

    if len(sys.argv) > 2:
        feature_type = sys.argv[2]
    else:
        # feature_type = 'datavol'
        feature_type = 'itervar'
        # feature_type = 'datavol_itervar'
    print('Feature:', feature_type)

    if len(sys.argv) > 3:
        if 'small' == sys.argv[3]:
            func_create = 'conv2d_NCHW_small.x86'
        elif 'wide' == sys.argv[3]:
            func_create = 'conv2d_NCHW_wide.x86'
        else:
            func_create = 'conv2d_NCHWc.x86'
    else:
        func_create = 'conv2d_NCHWc.x86'

    if len(sys.argv) > 4:
        count = int(sys.argv[4])
    else:
        count = 1

    if len(sys.argv) > 5:
        likwid_event = sys.argv[5]
    else:
        likwid_event = None

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, origin_layout, 'float32'),
                               target='llvm -mcpu=core-avx2')
    using_NCHWc = True
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    ctx = tvm.cpu()
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding).astype(np.float32)

    for i in range(count):
        log_filename = '%s_%i_%s_%s_%icore_rand.log' % (
            key, i, feature_type, sys.argv[3], num_threads)
        tuner = autotvm.tuner.XGBTuner(task, feature_type=feature_type,
                                       loss_type='rank', plan_size=32)
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[
                       autotvm.callback.progress_bar(trials),
                       autotvm.callback.log_to_file(log_filename)
                   ],
                   likwid_event=likwid_event)

        dispatch_context = autotvm.apply_history_best(log_filename)
        best_config = dispatch_context.query(task.target, task.workload)
        print("\nBest config:")
        print(best_config)

        # apply history best from log file
        with autotvm.apply_history_best(log_filename):
            with tvm.target.create("llvm -mcpu=core-avx2"):
                s, arg_bufs = task.func(*task.args)
                func = tvm.build(s, arg_bufs)

        if using_NCHWc:
            a_np_reshape = a_np.reshape(
                (N, CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))
            w_np_reshape = w_np.reshape(
                (CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1],
                 CI // best_config['tile_ic'].size[-1],
                 best_config['tile_ic'].size[-1],
                 KH, KW)).transpose((0, 2, 4, 5, 3, 1))
            c_np_reshape = c_np.reshape(
                (N, CO // best_config['tile_oc'].size[-1],
                 best_config['tile_oc'].size[-1], H, W)).transpose((0, 1, 3, 4, 2))
        a_tvm = tvm.nd.array(a_np_reshape, ctx=ctx)
        w_tvm = tvm.nd.array(w_np_reshape, ctx=ctx)
        c_tvm = tvm.nd.array(c_np_reshape, ctx=ctx)

        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            func(c_tvm, w_tvm, a_tvm)
        else:
            func(c_tvm, a_tvm, w_tvm)

        try:
            tvm.testing.assert_allclose(c_np_reshape, c_tvm.asnumpy(), rtol=1e-2)
        except AssertionError:
            print('WARNING: Not equal!')

        evaluator = func.time_evaluator(func.entry_name, ctx, repeat=3, number=4)
        if tuple(arg_bufs[1].shape) == w_tvm.shape:
            print(evaluator(c_tvm, w_tvm, a_tvm))
        else:
            print(evaluator(c_tvm, a_tvm, w_tvm))

        os.remove(log_filename)
        print(tvm.lower(s, arg_bufs, simple_mode=True))

        if likwid_event is not None:
            with open('data/likwid_rand_%s_%s_features_%icore_%i_%s.pkl' % (
                    key, feature_type, num_threads, trials, sys.argv[3]),
                    'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
        else:
            with open('data/%s_%s_features_%icore_%i_%s.pkl' % (
                    key, feature_type, num_threads, trials, sys.argv[3]),
                    'wb') as output:
                pickle.dump([best_config, task, tuner.cost_model.saved_features],
                            output, pickle.HIGHEST_PROTOCOL)
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)

    task = autotvm.task.create(schedule_conv2d_nhwc_auto,
                               args=(batch, in_channel, in_size, num_filter,
                                     kernel, stride),
                               target="cuda")
    print(task.config_space)

    # logging config (for printing tuning log to the screen)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    # There are two steps for measuring a config: build and run.
    # By default, we use all CPU cores to compile the program, then measure
    # the configs sequentially. We measure 10 times per config and take the
    # average to reduce variance.
    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=10))

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=25,
               measure_option=measure_option,
               callbacks=[
                   autotvm.callback.log_to_file('conv2d_nhwc_{}.log'.format(in_size))
               ])

    with autotvm.apply_history_best('conv2d_nhwc_{}.log'.format(in_size)):
        with tvm.target.create(device):
            s, [A, W, B] = schedule_conv2d_nhwc_auto(batch, in_channel, in_size,
                                                     num_filter, kernel, stride)
            func = tvm.build(s, [A, W, B], device, name=("ddd%dddd" % in_size))

    @memoize("verify_nhwc")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()

    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(b_np.shape), dtype=dtype), ctx)
    func(a, w, b)

    timer_1 = func.time_evaluator(func.entry_name, ctx, number=10)
    tcost_1 = timer_1(a, w, b).mean
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
    print("1x1 convolution: average running time is {:.2f} us.".format(
        tcost_1 * 1e6))
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)

    task = autotvm.task.create(schedule_depthwise_conv2d_nhwc_reuse_auto,
                               args=(batch, in_channel, in_size,
                                     channel_multiplier, kernel, stride),
                               target="cuda")
    print(task)
    print(task.config_space)

    # logging config (for printing tuning log to the screen)
    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    # There are two steps for measuring a config: build and run.
    # By default, we use all CPU cores to compile the program, then measure
    # the configs sequentially. We measure 10 times per config and take the
    # average to reduce variance.
    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=10))

    tuner = autotvm.tuner.RandomTuner(task)
    tuner.tune(n_trial=25,
               measure_option=measure_option,
               callbacks=[
                   autotvm.callback.log_to_file(
                       'depthwise_conv2d_nhwc_{}.log'.format(in_size))
               ])

    with autotvm.apply_history_best('depthwise_conv2d_nhwc_{}.log'.format(in_size)):
        with tvm.target.create(device):
            s1, [Input, Filter, DepthwiseConv2d] = \
                schedule_depthwise_conv2d_nhwc_reuse_auto(
                    batch, in_channel, in_size, channel_multiplier,
                    kernel, stride)
            # s3 = schedule_depthwise_conv2d_nhwc_reuse(Relu)
            # build the kernels
            f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device,
                           name="ddd%dddd" % in_size)
            # f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
            # f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

    # Prepare pod type for test data closure
    dtype = Input.dtype
    input_shape = get_const_tuple(Input.shape)
    filter_shape = get_const_tuple(Filter.shape)
    # scale_shape = get_const_tuple(Scale.shape)
    # shift_shape = get_const_tuple(Shift.shape)
    # scale_shift_shape = get_const_tuple(ScaleShift.shape)

    # Use memoize, pickle the test data for next time use.
    @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc")
    def get_ref_data():
        input_np = np.random.uniform(size=input_shape).astype(dtype)
        filter_np = np.random.uniform(size=filter_shape).astype(dtype)
        # scale_np = np.random.uniform(size=scale_shape).astype(dtype)
        # shift_np = np.random.uniform(size=shift_shape).astype(dtype)
        # correctness with scipy
        depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc(
            input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        # scale_shift_scipy = np.zeros(shape=scale_shift_shape)
        # for c in range(in_channel * channel_multiplier):
        #     scale_shift_scipy[:, :, :, c] = \
        #         depthwise_conv2d_scipy[:, :, :, c] * scale_np[c] + shift_np[c]
        # relu_scipy = np.maximum(scale_shift_scipy, 0)
        # return (input_np, filter_np, scale_np, shift_np,
        #         depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy)
        return (input_np, filter_np, depthwise_conv2d_scipy)

    # Get the test data
    # (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy,
    #  scale_shift_scipy, relu_scipy) = get_ref_data()
    (input_np, filter_np, depthwise_conv2d_scipy) = get_ref_data()

    # prepare data
    input_tvm = tvm.nd.array(input_np, ctx)
    filter_tvm = tvm.nd.array(filter_np, ctx)
    # scale_tvm = tvm.nd.array(scale_np, ctx)
    # shift_tvm = tvm.nd.array(shift_np, ctx)
    depthwise_conv2d_tvm = tvm.nd.array(
        np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                 dtype=DepthwiseConv2d.dtype), ctx)
    # scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape),
    #                                         dtype=ScaleShift.dtype), ctx)
    # relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape),
    #                                  dtype=Relu.dtype), ctx)

    # launch kernel 1 (depthwise_conv2d)
    timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=10)
    tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
    # launch kernel 2 (depthwise_conv2d + scale_shift)
    # timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=10)
    # tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
    #                   scale_shift_tvm).mean
    # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
    # timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=10)
    # tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
    # relu_scipy = np.maximum(scale_shift_scipy, 0)
    np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                               depthwise_conv2d_scipy, rtol=1e-5)
    # np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy,
    #                            rtol=1e-5)
    # np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
    print("Depthwise convolution: average running time is {:.2f} us.".format(
        tcost_1 * 1e6))
def main():
    with tvm.target.cuda():
        with autotvm.apply_history_best(args.log_file):
            for batch in [1, 16]:
                for name in ['vgg-19', 'resnet-50', 'resnext-50',
                             'inception_v3', 'drn-c-26', 'dcn-resnet-101']:
                    bench(name, batch)
def tvm_generic(N, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1,
                groups=1, number=100, dev=0, timeout=4, target="llvm",
                trials=100):
    data_shape = (N, C, H, W)
    data = relay.var("data", shape=data_shape, dtype="float32")
    kernel_size = (kernel_size, kernel_size)
    stride = (stride, stride)
    padding = (padding, padding)
    body = layers.conv2d(data=data, channels=K, kernel_size=kernel_size,
                         strides=stride, padding=padding, name="conv2d")
    op = relay.Function(relay.ir_pass.free_vars(body), body)
    sym, params = create_workload(op)

    tasks = autotvm.task.extract_from_program(op, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))

    tuning_option = {
        "log_filename": "tvm_baseline_{}.log".format(
            (N, C, H, W, K, kernel_size, stride, padding, dilation, groups)),
        "tuner": "xgb",
        "early_stopping": 30,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=timeout),
            runner=autotvm.LocalRunner(number=number, repeat=1, timeout=timeout,
                                       min_repeat_ms=150),
            # runner=autotvm.RPCRunner(
            #     '1080ti',  # change the device key to your key
            #     '0.0.0.0', 9190,
            #     number=20, repeat=3, timeout=4, min_repeat_ms=150)
        ),
    }

    log_filename = tuning_option["log_filename"]
    tuner = tuning_option["tuner"]
    early_stopping = tuning_option["early_stopping"]
    measure_option = tuning_option["measure_option"]

    # only support one task
    assert len(tasks) == 1

    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(task, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(task, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(task)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = trials
        length = len(task.config_space)
        print("config space length=", length)
        # tuner_obj.tune(n_trial=min(n_trial, length),
        #                early_stopping=early_stopping,
        #                measure_option=measure_option,
        #                callbacks=[
        #                    autotvm.callback.progress_bar(n_trial, prefix=prefix),
        #                    autotvm.callback.log_to_file(log_filename)])

    if not os.path.exists(log_filename):
        raise RuntimeError("the log file {} doesn't exist".format(log_filename))

    with autotvm.apply_history_best(log_filename):
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(op, target=target,
                                                          params=params)

        ctx = tvm.context(str(target), 0)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=data_shape)).astype("float32"))
        module = runtime.create(graph, lib, ctx)
        module.set_input("data", data_tvm)
        module.set_input(**params)

        # evaluate
        ftimer = module.module.time_evaluator("run", ctx, number=number, repeat=1)
        prof_res = np.array(ftimer().results) * 1e3

    return prof_res
    runner=autotvm.LocalRunner(number=5))

# begin tuning, log records to file `matmul.log`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function, it
# will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
def run(name, N, H, W, CO, CI, KH, KW, stride, pad, dilation,
        trials=100, timeout=4, number=10, target="llvm", dev=0, tune=True):
    strides, padding = (stride, stride), (pad, pad)
    task = autotvm.task.create(conv2d_nchw,
                               args=(N, H, W, CO, CI, KH, KW, strides,
                                     padding, dilation),
                               target=target)
    print("config_space length:", len(task.config_space))

    logfile = "conv2d_" + name + "_{}".format(
        (N, CI, H, W, CO, KH, KW, stride, pad, dilation)) + ".log"

    # Use local gpu, measure 10 times for every config to reduce variance.
    # The timeout of compiling a program is 10 seconds, the timeout for
    # running is 4 seconds.
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(number=number, repeat=1, min_repeat_ms=150,
                                   timeout=timeout))

    # Begin tuning, log records to the log file.
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    beg = time.time()
    print("Tune: ", tune)
    if tune:
        tuner.tune(n_trial=trials,
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.log_to_file(logfile)])
    end = time.time()

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("Optimize use ", end - beg, "s")
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create(target):
            s, arg_bufs = conv2d_nchw(N, H, W, CO, CI, KH, KW, strides,
                                      padding, dilation)
            # print(tvm.lower(s, arg_bufs, simple_mode=True))
            func = tvm.build(s, arg_bufs, "cuda")
            print(func.imported_modules[0].get_source())
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.context(str(target), dev)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty(
        (N, CO,
         (H + 2 * pad - dilation * (KH - 1) - 1) // stride + 1,
         (W + 2 * pad - dilation * (KW - 1) - 1) // stride + 1), ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)
    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we choose a larger repeat number to reduce the
    # noise and the overhead of kernel launch. You can also use nvprof to
    # validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=number)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    return cost
target = tvm.target.cuda()
size = 320
x, img = data.transforms.presets.ssd.load_test(im_fname, short=size)
board = None
device = 'gpu'

# log_filename = '{}-{}.{}.{}.log'.format(model, size, board, device)
log_filename = '{}.{}.{}.log'.format(model, board, device)
with autotvm.apply_history_best(log_filename):
    loaded_lib = tvm.module.load("lib/{}.tvm.so".format(model))
    loaded_json = open("graph/{}.tvm.json".format(model)).read()
    # parameters in binary
    loaded_params = bytearray(
        open("params/{}.tvm.params".format(model), "rb").read())
    # nnvm.compiler.load_param_dict(loaded_params)
    fcreate = tvm.get_global_func("tvm.graph_runtime.create")
    ctx = tvm.gpu(0)
    # module = runtime.create(loaded_json, loaded_lib, ctx)
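# A hedged continuation sketch: instantiate the graph runtime from the loaded
# artifacts and feed it the saved parameters. The positional device_type /
# device_id signature of "tvm.graph_runtime.create" follows the old TVM deploy
# examples and is an assumption here, as is the packed-function access pattern.
gmod = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
gmod['load_params'](loaded_params)  # load the binary parameter blob
set_input, run, get_output = gmod['set_input'], gmod['run'], gmod['get_output']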
measure_option = autotvm.measure_option(mode='local', number=5)

# begin tuning, log records to file `cache.tsv`
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('cache.tsv')])

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function, it
# will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best('cache.tsv'):
    with tvm.target.create("llvm"):
        s, arg_bufs = matmul(N, L, M, 'float32')
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
def tune_and_evaluate(M, N, L, dtype, layout):
    task = autotvm.task.create("tutorial/auto_tensorcore/test_gemm",
                               args=(N, L, M, dtype, layout),
                               target='cuda')
    print(task.config_space)

    logging.getLogger('autotvm').setLevel(logging.DEBUG)
    logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

    measure_option = autotvm.measure_option(
        builder='local', runner=autotvm.LocalRunner(number=5))

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('matmul.log')])

    dispatch_context = autotvm.apply_history_best("matmul.log")
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    with autotvm.apply_history_best('matmul.log'):
        with tvm.target.create("cuda"):
            s, arg_bufs = test_gemm(N, L, M, dtype, layout)
            print(tvm.lower(s, arg_bufs, simple_mode=True))
            func = tvm.build(s, arg_bufs)
    dev_module = func.imported_modules[0]
    print(dev_module.get_source())

    # check correctness
    if layout == "NN":
        shape_a = (N, L)
        shape_b = (L, M)
    elif layout == "NT":
        shape_a = (L, N)
        shape_b = (L, M)
    elif layout == "TN":
        shape_a = (N, L)
        shape_b = (M, L)
    elif layout == "TT":
        shape_a = (L, N)
        shape_b = (M, L)

    a_np = None
    b_np = None
    c_np = None
    c_np_type = None
    if dtype == 'float16':
        c_np_type = np.float32
        a_np = np.random.uniform(size=shape_a).astype(np.float16)
        b_np = np.random.uniform(size=shape_b).astype(np.float16)
        if layout == "NN":
            c_np = np.dot(a_np, b_np)
        elif layout == "NT":
            c_np = np.dot(a_np.T, b_np)
        elif layout == "TN":
            c_np = np.dot(a_np, b_np.T)
        elif layout == "TT":
            c_np = np.dot(a_np.T, b_np.T)
    elif dtype == 'int8':
        c_np_type = np.int32
        a_np = np.random.randint(low=-128, high=127, size=shape_a).astype(np.int8)
        b_np = np.random.randint(low=-128, high=127, size=shape_b).astype(np.int8)
        if layout == "NN":
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32))
        elif layout == "NT":
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32))
        elif layout == "TN":
            c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T)
        elif layout == "TT":
            c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T)
    elif dtype == 'int4':
        c_np_type = np.int32
        a_np_int = np.random.randint(low=-8, high=7, size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=-8, high=7, size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        a_np = np.zeros(shape=(N, int(L / 8)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 8)), dtype=np.int32)
        # pack a_np --> col_major
        for i in range(N):
            for j in range(int(L / 8)):
                for k in range(8):
                    a_np[i, j] = a_np[i, j] | \
                        ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
        # pack b_np --> row_major
        for i in range(M):
            for j in range(int(L / 8)):
                for k in range(8):
                    b_np[i, j] = b_np[i, j] | \
                        ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4))
    elif dtype == 'int1':
        c_np_type = np.int32
        a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32)
        b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32)
        # "TN"
        c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T)
        a_np = np.zeros(shape=(N, int(L / 32)), dtype=np.int32)
        b_np = np.zeros(shape=(M, int(L / 32)), dtype=np.int32)
        for i in range(N):
            for j in range(int(L / 32)):
                for k in range(32):
                    a_np[i, j] = a_np[i, j] | \
                        ((a_np_int[i, j * 32 + k] & 0xf) << (31 - k))
        for i in range(M):
            for j in range(int(L / 32)):
                for k in range(32):
                    b_np[i, j] = b_np[i, j] | \
                        ((b_np_int[i, j * 32 + k] & 0xf) << (31 - k))

    ctx = tvm.gpu()  # assumed device context for the 'cuda' target
    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx)
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    b_tvm = tvm.nd.array(b_np, ctx=ctx)
    func(a_tvm, b_tvm, c_tvm)

    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
    print('Time cost of this operator: %f' %
          evaluator(a_tvm, b_tvm, c_tvm).mean)
def tune_kernels(N, H, W, CO, CI, KH, KW, strides, padding, dilation,
                 trials, log_filename, so_file, measure_option, tuner,
                 early_stopping):
    # N, H, W, CO, CI, KH, KW, strides, padding, dilation = \
    #     1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)
    data = ('TENSOR', (N, CI, H, W), 'float32')
    kernel = ('TENSOR', (CO, CI, KH, KW), 'float32')
    # data = deserialize_args(('TENSOR', tvm.placeholder((N, CI, H, W), dtype='float32', name='data')))
    # kernel = deserialize_args(('TENSOR', tvm.placeholder((CO, CI, KH, KW), dtype='float32', name='kernel')))
    origin_layout = 'NCHW'
    func_create = 'topi_x86_conv2d_NCHW_test'

    task = autotvm.task.create(func_create,
                               args=(data, kernel, strides, padding, 1,
                                     origin_layout, 'float32'),
                               target='llvm -mcpu=skylake-avx512',
                               template_key='direct')
    # task.workload = ['float32', 'float32', H, W, CI, 1, CO, KH, KW, 1, 1, 1, 1]
    print(task.config_space)
    trials = min(trials, len(task.config_space))

    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task, loss_type='rank')
    # tuner.tune(n_trial=trials,
    #            measure_option=measure_option,
    #            callbacks=[
    #                autotvm.callback.progress_bar(trials),
    #                autotvm.callback.log_to_file(log_filename)])

    dispatch_context = autotvm.apply_history_best(log_filename)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    with autotvm.apply_history_best(log_filename):
        with tvm.target.create("llvm -mcpu=skylake-avx512"):
            s, arg_bufs = task.func(data, kernel, strides, padding, 1,
                                    origin_layout, 'float32', best_config)
            func = tvm.build(s, arg_bufs, "llvm -mcpu=skylake-avx512",
                             name="fconv")
    print("arg_bufs 0", arg_bufs[0])
    print("arg_bufs 1", arg_bufs[1])
    print("arg_bufs 2", arg_bufs[2])
    # print(func.get_source())
    '''
    dump = "%s.ll" % log_filename
    f = open(dump, "a")
    f.write(func.get_source())
    f.close()
    '''

    # path_dso = "/home/yufan/openmp-8.0.1.src/build/runtime/src/libomp.so"
    # m = tvm.module.load(path_dso)
    path_dso = so_file  # path to the prebuilt shared library
    m = tvm.module.load(path_dso)
    fconv = m['fconv']

    iteration = 50
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
    a_tvm = tvm.nd.array(a_np)
    w_tvm = tvm.nd.array(w_np)
    c_tvm = tvm.nd.empty(c_np.shape)

    print("\n============= Conti ====================\n")
    for x in range(0, iteration):
        fconv(a_tvm, w_tvm, c_tvm)
    tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
    print("\n============= Conti DONE ====================\n")

    outH = arg_bufs[2].shape[2]
    outW = arg_bufs[2].shape[3]
    ctx = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, ctx, number=500)
    time = evaluator(a_tvm, w_tvm, c_tvm).mean
    total_flop = 2 * N * outH * outW * CO * CI * KH * KW
    print('\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print('total_flop : ', total_flop)
    print('Time cost of this operator: %f' % time)
    print('GFLOPs : %f' % (total_flop / time / 1000 / 1000 / 1000))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n')
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
print(task.config_space)

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
)

log_name = 'gemm_int8.log'
if DO_TUNING:
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=1000, measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_name)])

    dispatch_context = autotvm.apply_history_best(log_name)
    best_config = dispatch_context.query(task.target, task.workload)
    print('\nBest config:')
    print(best_config)
else:
    config = task.config_space.get(PRETUNED_INDEX)
    dispatch_context = autotvm.task.ApplyConfig(config)
    print("Using pretuned config:")
    print(config)

with dispatch_context:
    with tvm.target.create('cuda'):
        s, arg_bufs = gemm_int8(n, m, l)
        f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8')

ctx = tvm.context('cuda', 0)
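# Hedged continuation sketch (an addition): check the tuned int8 gemm against
# numpy on the context created above. This assumes the recipe's "TN"-style
# layout where B has shape (m, l) and C = A . B^T; adjust if your gemm_int8
# template differs.
a_np = np.random.randint(low=-128, high=127, size=(n, l)).astype('int8')
b_np = np.random.randint(low=-128, high=127, size=(m, l)).astype('int8')
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx)
f(a, b, c)
tvm.testing.assert_allclose(
    c.asnumpy(),
    np.dot(a_np.astype('int32'), b_np.astype('int32').T),
    rtol=1e-5)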
def tune(mod, params, X_ex):
    number = 10
    repeat = 1
    min_repeat_ms = 0  # since we're tuning on a CPU, this can be set to 0
    timeout = 10  # in seconds

    # create a TVM runner
    runner = autotvm.LocalRunner(
        number=number,
        repeat=repeat,
        timeout=timeout,
        min_repeat_ms=min_repeat_ms,
    )

    # Create a simple structure for holding tuning options. We use an XGBoost
    # algorithm for guiding the search. For a production job, you will want to
    # set the number of trials to be larger than the value of 10 used here. For
    # CPU we recommend 1500, for GPU 3000-4000. The number of trials required
    # can depend on the particular model and processor, so it's worth spending
    # some time evaluating performance across a range of values to find the
    # best balance between tuning time and model optimization. Because running
    # tuning is time intensive we set the number of trials to 10, but do not
    # recommend a value this small. The ``early_stopping`` parameter is the
    # minimum number of trials to run before a condition that stops the search
    # early can be applied. The measure option indicates where trial code will
    # be built, and where it will be run. In this case, we're using the
    # ``LocalRunner`` we just created and a ``LocalBuilder``. The
    # ``tuning_records`` option specifies a file to write the tuning data to.
    tuning_option = {
        "tuner": "xgb",
        "trials": 10,
        "early_stopping": 100,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(build_func="default"), runner=runner
        ),
        "tuning_records": "resnet-50-v2-autotuning.json",
    }

    tasks = autotvm.task.extract_from_program(mod["main"], target=TARGET, params=params)
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner_obj = XGBTuner(task, loss_type="rank")
        tuner_obj.tune(
            n_trial=min(tuning_option["trials"], len(task.config_space)),
            early_stopping=tuning_option["early_stopping"],
            measure_option=tuning_option["measure_option"],
            callbacks=[
                autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
                autotvm.callback.log_to_file(tuning_option["tuning_records"]),
            ],
        )

    # build with the same TARGET the tasks were extracted for
    with autotvm.apply_history_best(tuning_option["tuning_records"]):
        with tvm.transform.PassContext(opt_level=3, config={}):
            lib = relay.build(mod, target=TARGET, params=params)

    dev = tvm.device(str(TARGET), 0)
    optimized_module = graph_executor.GraphModule(lib["default"](dev))

    optimized_module.set_input("input0", X_ex)
    optimized_module.run()  # dry run test

    return optimized_module
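# Hedged usage sketch (an addition; names are illustrative): `mod`/`params`
# would come from a Relay frontend such as relay.frontend.from_pytorch or
# from_onnx, and `X_ex` is one example input batch matching the "input0"
# name used above.
optimized_module = tune(mod, params, X_ex)
tvm_out = optimized_module.get_output(0).numpy()
print(tvm_out.shape)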
# # [Task 22/24]  Current/Best:   13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done.
# # [Task 23/24]  Current/Best:   53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done.
# # [Task 24/24]  Current/Best:   25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done.

################################################################################
# Compiling an Optimized Model with Tuning Data
# ----------------------------------------------
#
# As an output of the tuning process above, we obtained the tuning records
# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results
# to generate high performance code for the model on your specified target.
#
# Now that tuning data for the model has been collected, we can re-compile the
# model using optimized operators to speed up our computations.

with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

################################################################################
# Verify that the optimized model runs and produces the same results:

dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).asnumpy()
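################################################################################
# Sketch (an addition, not from the original snippet): the same tutorial
# typically follows this by timing the optimized module with ``timeit``;
# ``np`` is assumed to be numpy, imported at the top of the script.

import timeit

timing_number = 10
timing_repeat = 10
optimized = (
    np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
    * 1000 / timing_number
)
print("optimized mean (ms): %.2f, std (ms): %.2f" % (np.mean(optimized), np.std(optimized)))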
# You can use alternatives like XGBTuner.
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("matmul.log")],
)

#########################################################################
# Finally we apply history best from the cache file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its argument and get the best config
# with the same argument.

# apply history best from log file
with autotvm.apply_history_best("matmul.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul(N, L, M, "float32")
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = a_np.dot(b_np)

c_tvm = tvm.nd.empty(c_np.shape)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
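# Optional timing sketch (an addition, not part of the original tutorial text):
# measure the tuned kernel with time_evaluator on the CPU, reusing the arrays
# created for the correctness check above.
dev = tvm.cpu()
evaluator = func.time_evaluator(func.entry_name, dev, number=10)
print("Time cost of this operator: %f" %
      evaluator(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm).mean)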
# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(
    n_trial=20,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("conv2d.log")],
)

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.

# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best("conv2d.log"):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
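# Hedged completion sketch (an addition): run the built kernel on the GPU and
# compare against the numpy reference computed above. `tvm.cuda()` and the
# `device=` keyword assume a reasonably recent TVM release.
dev = tvm.cuda()
a_tvm = tvm.nd.array(a_np, dev)
w_tvm = tvm.nd.array(w_np, dev)
c_tvm = tvm.nd.empty(c_np.shape, device=dev)
func(a_tvm, w_tvm, c_tvm)
tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-2)

evaluator = func.time_evaluator(func.entry_name, dev, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)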
# (continuation of an adaptive timing loop measuring the mxnet baseline)
    lat = (end - beg) * 1e3
    if lat >= min_repeat_ms:
        break
    number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618))
print('mxnet mean lat: %.2f ms' % (lat / number))

mod, params = relay.frontend.from_mxnet(mx_net, shape_dict)
ctx = tvm.cpu()
if args.arm:
    target = "llvm -device=arm_cpu -target=aarch64-linux-gnu"
else:
    target = "llvm -mcpu=skylake-avx512"

log_path = "autotvm_logs"
logs = [os.path.join(log_path, f) for f in os.listdir(log_path)]
autotvm_ctx = autotvm.apply_history_best(None)
for log_file in logs:
    autotvm_ctx.load(log_file)

# apply logs
print("Compile...")
with autotvm_ctx:
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod[mod.entry_func], target, params=params)

# benchmark
print("Check correctness...")
ex = runtime.create(graph, lib, ctx)
ex.set_input(data0=inputs, data1=token_types, data2=valid_length, **params)
ex.run()
out = ex.get_output(0)
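# Hedged sketch (an addition): mirror the mxnet measurement above for the
# compiled TVM module using time_evaluator on the same ctx, so the two mean
# latencies are comparable.
ftimer = ex.module.time_evaluator("run", ctx, number=10, repeat=3)
prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
print('tvm mean lat: %.2f ms' % np.mean(prof_res))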
print("Compiling the model...") out = 'yolov3.tx2.gpu' ins = 'yolov3.x86.gpu' graph = load_tvm_graph('graph/{}'.format(ins)) params = load_tvm_params('params/{}'.format(ins)) symbol = graph.symbol [neth, netw] = shape['data'][2:] # Current image shape is 608x608 <<<<<<< HEAD with autotvm.apply_history_best('yolov3-darknet.tx2.gpu.log'): with nnvm.compiler.build_config(opt_level = 2): graph, lib, params = nnvm.compiler.build(symbol, target, shape, dtype = dtype_dict, params = params) ======= with nnvm.compiler.build_config(opt_level = 2): graph, lib, params = nnvm.compiler.build(symbol, target, shape, dtype = dtype_dict, params = params) >>>>>>> 3df6457f817f3ee5923f83d0c9377e0a1a19fc2e ###################################################################### # Load a test image # -------------------------------------------------------------------- test_image = 'dog.jpg' print("Loading the test image...") img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \ test_image + '?raw=true'
def tune_and_evaluate(tuning_opt):
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, input_shape, out_shape = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
                                            shape={'data': input_shape}, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, target_host=target_host,
                shape={'data': input_shape}, params=params, dtype=dtype)

        # export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=50, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
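# Hedged sketch (an addition) of the tuning options this function consumes.
# The key names follow the upstream nnvm ARM tutorial, where `tune_tasks`
# accepts them as kwargs; device_key, port, and trial counts are illustrative.
tuning_option = {
    'log_filename': log_file,
    'tuner': 'xgb',
    'n_trial': 1000,
    'early_stopping': 400,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func='ndk' if use_android else 'default'),
        runner=autotvm.RPCRunner(device_key, host='localhost', port=9190,
                                 number=5, timeout=4),
    ),
}
tune_and_evaluate(tuning_option)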
)

# Begin tuning, log records to file `conv2d.log`
# During tuning we will also try many invalid configs, so you are expected to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('conv2d.log')])

#########################################################################
# Finally we can inspect the best config from log file, check correctness,
# and measure running time.

# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)