def quantize_relay_module(mod, params, qconfig=None):
    """Quantize the ``'main'`` function of a relay module in place.

    Parameters
    ----------
    mod : tvm.relay.module
        The original module. Its ``'main'`` entry is overwritten with the
        result of ``qtz.quantize``.
    params
        Forwarded unchanged to ``qtz.quantize`` as the ``params`` keyword.
    qconfig : tvm.relay.quantize.quantize.QConfig, optional
        The quantization configuration, used as a context manager around
        the quantization call. Any falsy value (including the default
        ``None``) is replaced by ``qtz.qconfig()``.

    Returns
    -------
    mod : tvm.relay.module
        The same module object that was passed in, with ``'main'`` replaced
        by its quantized version.
    """
    # Fall back to the default configuration when none was supplied.
    active_config = qconfig or qtz.qconfig()

    with active_config:
        logging.debug('current quantize config')
        logging.debug(qtz.current_qconfig())

        quantized_main = qtz.quantize(mod['main'], params=params)
        mod['main'] = quantized_main

        logging.debug('after quantize')
        logging.debug(mod['main'].astext(show_meta_data=False))

    return mod
def get_model(model_name, batch_size, qconfig, target=None, original=False, simulated=False):
    """Fetch a pretrained Gluon vision model and return it as a relay function.

    Parameters
    ----------
    model_name : str
        Name passed to ``gluon.model_zoo.vision.get_model``. The input image
        size is 299 for ``'inceptionv3'`` and 224 for every other name.
    batch_size : int
        Leading dimension of the ``"data"`` input shape.
    qconfig
        Context manager entered around the ``qtz.quantize`` call.
    target
        Accepted for interface compatibility; not used by this function.
    original : bool
        When truthy, return the pre-optimized function without quantizing.
    simulated
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    qfunc
        The function produced by ``relay.quantize.prerequisite_optimize``
        (``original`` truthy) or by ``qtz.quantize`` applied to it.
    """
    pretrained_net = gluon.model_zoo.vision.get_model(model_name, pretrained=True)

    if model_name == 'inceptionv3':
        side = 299
    else:
        side = 224
    shape_dict = {"data": (batch_size, 3, side, side)}
    mod, params = relay.frontend.from_mxnet(pretrained_net, shape_dict)
    main_func = mod['main']

    # Run the pre-quantization optimizations at opt_level 3.
    with tvm.transform.PassContext(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(main_func, params=params)

    logging.debug('original')
    logging.debug(qfunc.astext(show_meta_data=False))

    if not original:
        with qconfig:
            logging.debug('current quantize config')
            logging.debug(qtz.current_qconfig())
            qfunc = qtz.quantize(qfunc)
            logging.debug('after quantize')
            logging.debug(qfunc.astext(show_meta_data=False))

    return qfunc
def get_onnx_model(model_name, batch_size, qconfig, original=False, dataset=None):
    """Download the vit32 ONNX model, import it into relay and quantize it.

    Two files are fetched with ``download_file`` from a pinned base URL:
    the ONNX model itself and a log file that is later handed to
    ``tvm.autotvm.apply_history_best``.

    Parameters
    ----------
    model_name : str
        Must be ``"vit32"``; any other value fails the assertion below.
    batch_size : int
        Leading dimension of the ``"data"`` input shape
        ``(batch_size, 3, 224, 224)``.
    qconfig
        Context manager entered around the ``qtz.quantize`` call.
    original : bool
        When truthy, return right after ``prerequisite_optimize`` without
        quantizing.
    dataset : optional
        When not ``None``, it is forwarded to ``qtz.quantize`` as
        ``dataset`` and quantization runs inside ``tvm.target.cuda()`` and
        ``tvm.autotvm.apply_history_best(logfile)``.

    Returns
    -------
    (qfunc, params, logfile) : tuple
        The (optionally quantized) result of ``prerequisite_optimize``, the
        parameters returned by the ONNX frontend, and the log file name.

    Raises
    ------
    AssertionError
        If ``model_name`` is not ``"vit32"``.
    """
    assert model_name == "vit32", "Only support vit32 model!"
    base = "https://github.com/TheGreatCold/tvm-vit/raw/d2aa1e60eef42e2fdedbd1e13aa85ac5faf0a7fc"
    logfile = "gtx1660_vit_B32_224.log"
    onnx_path = "vit_B32_224.onnx"

    download_file(base, logfile)
    download_file(base, onnx_path)

    # Use a context manager so the model file handle is closed as soon as
    # the graph has been parsed, instead of being leaked.
    with open(onnx_path, "rb") as model_file:
        onnx_graph = onnx.load(model_file)

    data_shape = (batch_size, 3, 224, 224)
    mod, params = relay.frontend.from_onnx(onnx_graph, {"data": data_shape})

    with tvm.transform.PassContext(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(mod, params=params)
    logging.debug("original")
    logging.debug(qfunc.astext(show_meta_data=False))
    if original:
        return qfunc, params, logfile

    with qconfig:
        logging.debug("current quantize config")
        logging.debug(qtz.current_qconfig())

        if dataset is not None:
            # Quantization with a dataset runs under the CUDA target with
            # the downloaded log applied as autotvm history.
            with tvm.target.cuda():
                with tvm.autotvm.apply_history_best(logfile):
                    qfunc = qtz.quantize(qfunc, params, dataset=dataset)
        else:
            qfunc = qtz.quantize(qfunc, params)

        logging.debug("after quantize")
        logging.debug(qfunc.astext(show_meta_data=False))
    return qfunc, params, logfile
def get_model(model_name, batch_size, qconfig, target=None, original=False, simulated=False):
    """Load a pretrained Gluon vision model, pre-optimize it and quantize it.

    Parameters
    ----------
    model_name : str
        Name passed to ``gluon.model_zoo.vision.get_model``. The input image
        size is 299 for ``'inceptionv3'`` and 224 for every other name.
    batch_size : int
        Leading dimension of the ``"data"`` input shape.
    qconfig
        Context manager entered around the ``qtz.quantize`` call.
    target
        Accepted for interface compatibility; not used by this function.
    original : bool
        When truthy, return right after ``prerequisite_optimize`` without
        quantizing.
    simulated
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    qfunc
        When ``original`` is truthy: only the pre-optimized function.
    (qfunc, params, input_shape) : tuple
        Otherwise: the quantized function, the parameters returned by the
        MXNet frontend, and the input shape tuple.

    Notes
    -----
    NOTE(review): the two branches return different shapes (a bare function
    versus a 3-tuple). This is preserved as-is because callers may rely on
    it; confirm before unifying.
    """
    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
    img_size = 299 if model_name == 'inceptionv3' else 224
    input_shape = (batch_size, 3, img_size, img_size)
    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": input_shape})
    qfunc = mod['main']

    with relay.build_config(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(qfunc, params=params)
    logging.debug('original')
    logging.debug(qfunc.astext(show_meta_data=False))
    if original:
        return qfunc

    with qconfig:
        logging.debug('current quantize config')
        logging.debug(qtz.current_qconfig())
        qfunc = qtz.quantize(qfunc, params=params)
        logging.debug('after quantize')
        logging.debug(qfunc.astext(show_meta_data=False))
    return qfunc, params, input_shape
def build_model(args, gluon_model):
    """Convert a Gluon model to relay, optionally quantize it, and build it.

    Parameters
    ----------
    args
        Object providing the attributes read here: ``model``,
        ``batch_size``, ``target``, ``original``, ``simulated``,
        ``nbit_input``, ``global_scale``, ``dtype_input`` and
        ``dtype_output``.
    gluon_model
        Model handed to ``relay.frontend.from_mxnet``.

    Returns
    -------
    (graph, lib, params, ctx) : tuple
        The three values returned by ``relay.build`` plus the context
        ``tvm.nd.context(target, 0)``.
    """
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz

    # inceptionv3 is fed 299x299 images; every other model gets 224x224.
    if args.model == 'inceptionv3':
        side = 299
    else:
        side = 224
    input_shape = (args.batch_size, 3, side, side)
    net, params = relay.frontend.from_mxnet(gluon_model, {"data": input_shape})
    target = args.target

    if args.original:
        # Build the unquantized network directly.
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        return graph, lib, params, tvm.nd.context(target, 0)

    # Constant folding and scale folding.
    print('original')
    print(net.astext(show_meta_data=False))
    with relay.build_config(opt_level=3):
        staged = relay.optimize(net, target, params)
    print('after optimize')
    print(staged.astext(show_meta_data=False))

    # The weight settings deliberately reuse the input settings, as in the
    # original code (nbit_weight <- nbit_input, dtype_weight <- dtype_input).
    quantize_options = dict(
        skip_k_conv=0,
        nbit_input=args.nbit_input,
        nbit_weight=args.nbit_input,
        global_scale=args.global_scale,
        dtype_input=args.dtype_input,
        dtype_weight=args.dtype_input,
        dtype_activation=args.dtype_output,
        store_lowbit_output=False,
        debug_enabled_ops=None,
    )
    with qtz.qconfig(**quantize_options):
        print(qtz.current_qconfig())

        staged = qtz.annotate(staged)
        print('after annotate')
        print(staged.astext(show_meta_data=False))

        staged = qtz.calibrate(staged)
        print('after calibrate\n')
        print(staged.astext(show_meta_data=False))

        # In simulated mode the graph is left unrealized.
        if not args.simulated:
            staged = qtz.realize(staged)
            staged = relay.ir_pass.infer_type(staged)
            print('after realize\n')
            print(staged.astext(show_meta_data=False))

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(staged, target)
    return graph, lib, params, tvm.nd.context(target, 0)
def get_model(model_name, batch_size, qconfig, original=False):
    """Import a pretrained Gluon vision model into relay and quantize it.

    Parameters
    ----------
    model_name : str
        Name passed to ``gluon.model_zoo.vision.get_model``. The input image
        size is 299 for ``"inceptionv3"`` and 224 for every other name.
    batch_size : int
        Leading dimension of the ``"data"`` input shape.
    qconfig
        Context manager entered around the ``qtz.quantize`` call.
    original : bool
        When truthy, return the imported module without quantizing.

    Returns
    -------
    (mod_or_qfunc, params) : tuple
        The imported module (``original`` truthy) or the result of
        ``qtz.quantize(mod, params)``, together with the parameters returned
        by the MXNet frontend.
    """
    pretrained_net = gluon.model_zoo.vision.get_model(model_name, pretrained=True)

    if model_name == "inceptionv3":
        side = 299
    else:
        side = 224
    shape_dict = {"data": (batch_size, 3, side, side)}
    mod, params = relay.frontend.from_mxnet(pretrained_net, shape_dict)

    logging.debug("original")
    logging.debug(mod.astext(show_meta_data=False))

    result = mod
    if not original:
        with qconfig:
            logging.debug("current quantize config")
            logging.debug(qtz.current_qconfig())
            result = qtz.quantize(mod, params)
            logging.debug("after quantize")
            logging.debug(result.astext(show_meta_data=False))

    return result, params
def quantize_model(args):
    """Load an MXNet checkpoint, quantize it with relay, build and reload it.

    The checkpoint is read with ``mx.model.load_checkpoint(args.model, 0)``
    and imported with a fixed ``(args.batch_size, 3, 224, 224)`` input
    shape. On the quantized path the build artifacts are written to
    ``thisdir`` (``deploy_lib.so``, ``deploy_graph.json``,
    ``deploy_param.params``) and then read back, so the returned values are
    the reloaded ones.

    Parameters
    ----------
    args
        Object providing the attributes read here: ``model``,
        ``batch_size``, ``target``, ``original``, ``simulated``,
        ``nbit_input``, ``global_scale``, ``dtype_input`` and
        ``dtype_output``.

    Returns
    -------
    (graph, lib, params, ctx) : tuple
        When ``args.original`` is truthy: the three values returned by
        ``relay.build`` for the unquantized network plus
        ``tvm.nd.context(target, 0)``; nothing is written to disk.
        Otherwise: the graph JSON text read back from disk, the module
        loaded with ``tvm.module.load``, the serialized parameters as a
        ``bytearray``, and the context.
    """
    import tvm
    from tvm import relay
    from tvm.relay import quantize as qtz

    img_size = 224
    data_shape = (args.batch_size, 3, img_size, img_size)
    mx_sym, mx_args, mx_auxs = mx.model.load_checkpoint(args.model, 0)
    net, params = relay.frontend.from_mxnet(
        mx_sym, {"data": data_shape}, arg_params=mx_args, aux_params=mx_auxs)
    target = args.target

    if args.original:
        # Run the original (unquantized) model.
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(net, target, params=params)
        ctx = tvm.nd.context(target, 0)
        return graph, lib, params, ctx

    # Constant folding and scale folding.
    with relay.build_config(opt_level=3):
        qgraph = relay.optimize(net, target, params)

    # The weight settings reuse the input settings (nbit_weight <- nbit_input,
    # dtype_weight <- dtype_input), exactly as before.
    with qtz.qconfig(skip_k_conv=0,
                     nbit_input=args.nbit_input,
                     nbit_weight=args.nbit_input,
                     global_scale=args.global_scale,
                     dtype_input=args.dtype_input,
                     dtype_weight=args.dtype_input,
                     dtype_activation=args.dtype_output,
                     store_lowbit_output=False,
                     debug_enabled_ops=None):
        print(qtz.current_qconfig())
        qgraph = qtz.annotate(qgraph)
        qgraph = qtz.calibrate(qgraph)
        # In simulated mode the graph is left unrealized.
        if not args.simulated:
            qgraph = qtz.realize(qgraph)
            qgraph = relay.ir_pass.infer_type(qgraph)

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(qgraph, target)

    # Save the graph, lib and params into separate files.
    lib_path = os.path.join(thisdir, "deploy_lib.so")
    graph_path = os.path.join(thisdir, "deploy_graph.json")
    params_path = os.path.join(thisdir, "deploy_param.params")

    lib.export_library(lib_path)
    with open(graph_path, "w") as fo:
        fo.write(graph)
    with open(params_path, "wb") as fo:
        fo.write(relay.save_param_dict(params))

    # Load them back. Context managers make sure both read handles are
    # closed instead of being left to the garbage collector.
    with open(graph_path) as fi:
        graph = fi.read()
    lib = tvm.module.load(lib_path)
    with open(params_path, "rb") as fi:
        params = bytearray(fi.read())

    ctx = tvm.nd.context(target, 0)
    return graph, lib, params, ctx