def test_save_and_load_model(self):  # type: () -> None
    proto = self._simple_model()
    cls = ModelProto
    proto_string = onnx._serialize(proto)

    # Test if input is string
    loaded_proto = onnx.load_model_from_string(proto_string)
    self.assertTrue(proto == loaded_proto)

    # Test if input has a read function
    f = io.BytesIO()
    onnx.save_model(proto_string, f)
    f = io.BytesIO(f.getvalue())
    loaded_proto = onnx.load_model(f, cls)
    self.assertTrue(proto == loaded_proto)

    # Test if input is a file name
    try:
        fi = tempfile.NamedTemporaryFile(delete=False)
        onnx.save_model(proto, fi)
        fi.close()
        loaded_proto = onnx.load_model(fi.name, cls)
        self.assertTrue(proto == loaded_proto)
    finally:
        os.remove(fi.name)
def run(args):
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()
    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)
    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)

    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)

    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)
    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        num_iterations = args.iterations - 1
        start = time.time()
        for t in range(num_iterations):
            graph_module.run()
        cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))
def import_to_gluon(model_file, ctx):
    """Imports the ONNX model file, passed as a parameter, into a Gluon
    SymbolBlock object.

    Parameters
    ----------
    model_file : str
        ONNX model file name
    ctx : Context or list of Context
        Loads the model into one or many context(s).

    Returns
    -------
    sym_block : :class:`~mxnet.gluon.SymbolBlock`
        A SymbolBlock object representing the given model file.

    Notes
    -----
    This method is available when you ``import mxnet.contrib.onnx``
    """
    graph = GraphProto()
    try:
        import onnx
    except ImportError:
        raise ImportError("Onnx and protobuf need to be installed. Instructions to"
                          + " install - https://github.com/onnx/onnx#installation")
    model_proto = onnx.load_model(model_file)
    net = graph.graph_to_gluon(model_proto.graph, ctx)
    return net
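# Not part of the function above: a minimal usage sketch of import_to_gluon.
# The model file name and input shape are hypothetical placeholders.
import mxnet as mx
from mxnet.contrib import onnx as onnx_mxnet

net = onnx_mxnet.import_to_gluon('model.onnx', ctx=mx.cpu())  # hypothetical file name
out = net(mx.nd.ones((1, 3, 224, 224)))                       # hypothetical input shape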
def get_model_metadata(model_file):
    """Returns the name and shape information of input and output tensors of
    the given ONNX model file.

    Notes
    -----
    This method is available when you ``import mxnet.contrib.onnx``

    Parameters
    ----------
    model_file : str
        ONNX model file name

    Returns
    -------
    model_metadata : dict
        A dictionary object mapping various metadata to its corresponding value.
        The dictionary will have the following template::

            'input_tensor_data' : list of tuples representing the shape of the input parameters
            'output_tensor_data' : list of tuples representing the shape of the output of the model
    """
    graph = GraphProto()
    try:
        import onnx
    except ImportError:
        raise ImportError("Onnx and protobuf need to be installed. "
                          + "Instructions to install - https://github.com/onnx/onnx")
    model_proto = onnx.load_model(model_file)
    metadata = graph.get_graph_metadata(model_proto.graph)
    return metadata
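# Not part of the function above: a minimal usage sketch of get_model_metadata.
# The model file name is a hypothetical placeholder.
from mxnet.contrib import onnx as onnx_mxnet

metadata = onnx_mxnet.get_model_metadata('model.onnx')
input_info = metadata['input_tensor_data']    # list of (name, shape) tuples
output_info = metadata['output_tensor_data']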
def test_import_export(self): for test in test_cases: test_name, mxnet_op, onnx_name, inputs, attrs, mxnet_specific, fix_attrs, check_value, check_shape = test with self.subTest(test_name): names, input_tensors, inputsym = get_input_tensors(inputs) if inputs: test_op = mxnet_op(*inputsym, **attrs) mxnet_output = forward_pass(test_op, None, None, names, inputs) outputshape = np.shape(mxnet_output) else: test_op = mxnet_op(**attrs) shape = attrs.get('shape', (1,)) x = mx.nd.zeros(shape, dtype='float32') xgrad = mx.nd.zeros(shape, dtype='float32') exe = test_op.bind(ctx=mx.cpu(), args={'x': x}, args_grad={'x': xgrad}) mxnet_output = exe.forward(is_train=False)[0].asnumpy() outputshape = np.shape(mxnet_output) if mxnet_specific: onnxmodelfile = onnx_mxnet.export_model(test_op, {}, [np.shape(ip) for ip in inputs], np.float32, onnx_name + ".onnx") onnxmodel = load_model(onnxmodelfile) else: onnx_attrs = _fix_attributes(attrs, fix_attrs) onnxmodel = get_onnx_graph(test_name, names, input_tensors, onnx_name, outputshape, onnx_attrs) bkd_rep = backend.prepare(onnxmodel, operation='export') output = bkd_rep.run(inputs) if check_value: npt.assert_almost_equal(output[0], mxnet_output) if check_shape: npt.assert_equal(output[0].shape, outputshape) input1 = get_rnd((1, 10, 2, 3)) ipsym = mx.sym.Variable("input1") for test in test_scalar_ops: if test == 'Add': outsym = 2 + ipsym if test == "Sub": outsym = ipsym - 2 if test == "rSub": outsym = ipsym.__rsub__(2) if test == "Mul": outsym = 2 * ipsym if test == "Div": outsym = ipsym / 2 if test == "Pow": outsym = ipsym ** 2 forward_op = forward_pass(outsym, None, None, ['input1'], input1) converted_model = onnx_mxnet.export_model(outsym, {}, [np.shape(input1)], np.float32, onnx_file_path=outsym.name + ".onnx") sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model) result = forward_pass(sym, arg_params, aux_params, ['input1'], input1) npt.assert_almost_equal(result, forward_op)
def verify_onnx_forward_impl(graph_file, data_shape, out_shape):
    dtype = 'float32'
    x = np.random.uniform(size=data_shape)
    model = onnx.load_model(graph_file)
    c2_out = get_caffe2_output(model, x, dtype)
    for target, ctx in ctx_list():
        tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype)
        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
def test_exports(self):
    input_shape = (2, 1, 3, 1)
    for test in export_test_cases:
        test_name, onnx_name, mx_op, attrs = test
        input_sym = mx.sym.var('data')
        outsym = mx_op(input_sym, **attrs)
        converted_model = onnx_mxnet.export_model(outsym, {}, [input_shape], np.float32,
                                                  onnx_file_path=outsym.name + ".onnx")
        model = load_model(converted_model)
        checker.check_model(model)
def compare_graph(onnx_file, nnvm_sym, ishape):
    onnx_model = onnx.load_model(onnx_file)
    onnx_sym, params = nnvm.frontend.from_onnx(onnx_model)
    g1 = nnvm.graph.create(onnx_sym)
    g2 = nnvm.graph.create(nnvm_sym)
    input_name = onnx_model.graph.input[0].name
    ishapes = {input_name: ishape}
    graph_attr.set_shape_inputs(g1, ishapes)
    graph_attr.set_shape_inputs(g2, ishapes)
    g1 = g1.apply("InferShape").apply("SimplifyInference")
    g2 = g2.apply("InferShape").apply("SimplifyInference")
    graph_util.check_graph_equal(g1, g2)
def get_network(name, batch_size): """Get the symbol definition and random weight of a network""" input_shape = (batch_size, 3, 224, 224) output_shape = (batch_size, 1000) if "resnet" in name: n_layer = int(name.split('-')[1]) net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size) elif "vgg" in name: n_layer = int(name.split('-')[1]) net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size) elif name == 'mobilenet': net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size) elif name == 'squeezenet_v1.1': net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1') elif name == 'inception_v3': input_shape = (1, 3, 299, 299) net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size) elif name == 'custom': # an example for custom network from nnvm.testing import utils net = nnvm.sym.Variable('data') net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1)) net = nnvm.sym.flatten(net) net = nnvm.sym.dense(net, units=1000) net, params = utils.create_workload(net, batch_size, (3, 224, 224)) elif name == 'mxnet': # an example for mxnet model from mxnet.gluon.model_zoo.vision import get_model block = get_model('resnet18_v1', pretrained=True) net, params = nnvm.frontend.from_mxnet(block) net = nnvm.sym.softmax(net) else: onnx_model = onnx.load_model( 'out/models/resnet50_conv_bs1_0/model.onnx') net, params = nnvm.frontend.from_onnx(onnx_model) output_shape = (batch_size, 6, 112, 112) return net, params, input_shape, output_shape
def import_model(model_file):
    """Imports the ONNX model file, passed as a parameter, into MXNet symbol
    and parameters.

    Operator support and coverage -
    https://cwiki.apache.org/confluence/display/MXNET/MXNet-ONNX+Integration

    Parameters
    ----------
    model_file : str
        ONNX model file name

    Returns
    -------
    sym : :class:`~mxnet.symbol.Symbol`
        MXNet symbol object
    arg_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
        Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
    aux_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
        Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format

    Notes
    -----
    This method is available when you ``import mxnet.contrib.onnx``
    """
    graph = GraphProto()
    try:
        import onnx
    except ImportError:
        raise ImportError("Onnx and protobuf need to be installed. "
                          + "Instructions to install - https://github.com/onnx/onnx")
    # loads model file and returns ONNX protobuf object
    model_proto = onnx.load_model(model_file)
    sym, arg_params, aux_params = graph.from_onnx(model_proto.graph)
    return sym, arg_params, aux_params
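# Not part of the function above: a minimal usage sketch of import_model, simply
# unpacking the three documented return values. The model file name is hypothetical.
from mxnet.contrib import onnx as onnx_mxnet

sym, arg_params, aux_params = onnx_mxnet.import_model('model.onnx')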
def optimize_model(input, model_type='bert', num_heads=12, hidden_size=768, optimization_options=None, opt_level=0, use_gpu=False, only_onnxruntime=False): """ Optimize Model by OnnxRuntime and/or offline fusion logic. The following optimizes model by OnnxRuntime only, and no offline fusion logic: optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True) If you want to optimize model by offline fusion logic. optimize_model(input, model_type, num_heads=12, hidden_size=768, optimization_options=your_options) Args: input (str): input model path. model_type (str): model type - like bert, bert_tf, bert_keras or gpt2. num_heads (int): number of attention heads. hidden_size (int): hidden size. optimization_options (OptimizationOptions or None): optimization options that can use to turn on/off some fusions. opt_level (int): onnxruntime graph optimization level (0, 1, 2 or 99). When the level > 0, onnxruntime will be used to optimize model first. use_gpu (bool): use gpu or not for onnxruntime. only_onnxruntime (bool): only use onnxruntime to optimize model, and no offline fusion logic is used. Returns: object of an optimizer class. """ (optimizer_class, producer, run_onnxruntime) = MODEL_CLASSES[model_type] temp_model_path = None if opt_level > 1: # Optimization specified for an execution provider. temp_model_path = optimize_by_onnxruntime(input, use_gpu=use_gpu, opt_level=opt_level) elif run_onnxruntime: # Use Onnxruntime to do optimizations (like constant folding and cast elimation) that is not specified to exection provider. # CPU provider is used here so that there is no extra node for GPU memory copy. temp_model_path = optimize_by_onnxruntime(input, use_gpu=False, opt_level=1) model = load_model(temp_model_path or input, format=None, load_external_data=True) if model.producer_name and producer != model.producer_name: logger.warning( f"Model producer not matched: Expect {producer}, Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter." ) if optimization_options is None: optimization_options = BertOptimizationOptions(model_type) optimizer = optimizer_class(model, num_heads, hidden_size) if not only_onnxruntime: optimizer.optimize(optimization_options) # Remove the temporary model. if temp_model_path: os.remove(temp_model_path) logger.debug("Remove tempoary model: {}".format(temp_model_path)) optimizer.model.producer_name = "onnxruntime_tools" optimizer.model.producer_version = "1.4" return optimizer
from mxnet import autograd, np, npx, gluon, init
from onnx import checker
import onnx
from mxnet.contrib import onnx as onnx_mxnet

npx.set_np()

sym = './test-symbol.json'
params = './test-0010.params'
input_shape = (1, 1, 28, 28)
onnx_file = './test.onnx'
converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape],
                                               np.float32, onnx_file, 1)

# Load onnx model
model_proto = onnx.load_model(converted_model_path)

# Check if converted ONNX protobuf is valid
checker.check_graph(model_proto.graph)
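# A hedged follow-up (not in the original snippet): checker.check_model validates
# the whole ModelProto, including opset imports, rather than only the graph.
checker.check_model(model_proto)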
def test_check_model_by_model(self):  # type: () -> None
    model = onnx.load_model(self.model_filename, load_external_data=False)
    with pytest.raises(ValueError):
        load_external_data_for_model(model, self.temp_dir)  # Exceeds maximum protobuf
        checker.check_model(model)  # checker catches 2GB models as well
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import onnx
import onnx.utils
import sys

model_file = sys.argv[1]
print("-- Opening ONNX file=%s" % model_file)
model = onnx.load_model(model_file)  # type: onnx.ModelProto

print("-- ONNX OpSet=%s" % model.opset_import)
print("-- ONNX model - Number of nodes=%d" % len(model.graph.node))
print()

print("-- Begin ONNX model --")
# Print a human readable representation of the model graph
print(onnx.helper.printable_graph(model.graph))
print("-- End ONNX model --")
print()

onnx.checker.check_model(model)
print('-- ONNX model validated OK')
print()
def check_lstm_with_type(lstm_type, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)): has_proj = "p" in lstm_type device = torch.device("cpu") hidden_layers_num = 1 model = None for batch_first in (True, False): for use_bias in (True, False): for rnd_weights in (True, False): if lstm_type == "uni": model = LSTM_Model( device, batch_first=batch_first, rnd_weights_init=rnd_weights, use_bias=use_bias, ) elif lstm_type == "b": model = LSTM_Model( device, batch_first=batch_first, bidirectional=True, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = 2 elif lstm_type == "p": model = LSTM_Model( device, batch_first=batch_first, proj_size=projection_size, rnd_weights_init=rnd_weights, use_bias=use_bias, ) elif lstm_type == "s": model = LSTM_Model( device, batch_first=batch_first, layer_num=model_num_layers, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = model_num_layers elif lstm_type == "sb": model = LSTM_Model( device, batch_first=batch_first, bidirectional=True, layer_num=model_num_layers, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = 2 * model_num_layers elif lstm_type == "sp": model = LSTM_Model( device, batch_first=batch_first, layer_num=model_num_layers, proj_size=projection_size, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = model_num_layers elif lstm_type == "bp": model = LSTM_Model( device, batch_first=batch_first, bidirectional=True, proj_size=projection_size, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = 2 elif lstm_type == "sbp": model = LSTM_Model( device, batch_first=batch_first, bidirectional=True, layer_num=model_num_layers, proj_size=projection_size, rnd_weights_init=rnd_weights, use_bias=use_bias, ) hidden_layers_num = 2 * model_num_layers else: print( "WARNING: LSTM type {} is not supported here!".format( lstm_type)) return model.eval() # Get golden output from original model input_hidden_shape = (hidden_layers_num, batch_size, model_hidden_size) input_hidden_shape_with_proj = (hidden_layers_num, batch_size, projection_size) dummy_input, input_shape = model.get_dummy_input() golden_output_batch = model.forward( dummy_input.to(device)).detach().cpu().numpy() dtype = "float32" h_zeros = np.zeros(input_hidden_shape, dtype=dtype) if has_proj: h_zeros = np.zeros(input_hidden_shape_with_proj, dtype=dtype) c_zeros = np.zeros(input_hidden_shape, dtype=dtype) tvm_output = None for format in ("ts", "onnx"): if format == "ts": # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing. traced_script_module = torch.jit.trace( model, dummy_input).eval() # Import model to Relay shape_list = [("input", input_shape)] mod, params = relay.frontend.from_pytorch( traced_script_module, shape_list) # Model compilation by tvm with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) elif format == "onnx": if has_proj: print( "WARNING: torch.onnx.export does not support conversion LSTM with projection " "from pytorch! TODO: waiting for the support and correct test after that." 
) continue onnx_io = io.BytesIO() with torch.no_grad(): h0 = torch.rand(input_hidden_shape) if has_proj: h0 = torch.rand(input_hidden_shape_with_proj) c0 = torch.rand(input_hidden_shape) input_names = ["input", "h0", "c0"] # default export (without dynamic input) torch.onnx.export(model, (dummy_input, (h0, c0)), onnx_io, input_names=input_names) onnx_io.seek(0, 0) onnx_model = onnx.load_model(onnx_io) # Import model to Relay shape_dict = { "input": input_shape, "h0": input_hidden_shape, "c0": input_hidden_shape, } if has_proj: shape_dict = { "input": input_shape, "h0": input_hidden_shape_with_proj, "c0": input_hidden_shape, } mod, params = relay.frontend.from_onnx( onnx_model, shape_dict) # Model compilation by tvm with tvm.transform.PassContext(opt_level=1): lib = relay.build(mod, target=target, params=params) # Inference of the model with given input data m = graph_executor.GraphModule(lib["default"](dev)) # Set inputs m.set_input( input=tvm.nd.array(dummy_input.numpy().astype(dtype)), h0=tvm.nd.array(h_zeros), c0=tvm.nd.array(c_zeros), ) # Execute m.run() # Get outputs (converted to numpy array) tvm_output = m.get_output(0).numpy() compare(tvm_output, golden_output_batch)
import onnx
import sys

size = 256

if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("Missing model file name")
    else:
        model_name = sys.argv[1]
        print("model name: " + model_name)
        model = onnx.load_model(model_name)
        d = model.graph.input[0].type.tensor_type.shape.dim
        print(d)
        d[2].dim_value = size
        d[3].dim_value = size
        for output in model.graph.output:
            d = output.type.tensor_type.shape.dim
            d[2].dim_value = size
            d[3].dim_value = size
            print(d)
        onnx.save_model(model, "convert.onnx")
import onnx
import math

input_size = (480, 640)
# input_size = (720, 1280)

model = onnx.load_model(
    "/home/nano/workspace/CenterFace/models/onnx/centerface.onnx")

d = model.graph.input[0].type.tensor_type.shape.dim
print(d)
rate = (
    int(math.ceil(input_size[0] / d[2].dim_value)),
    int(math.ceil(input_size[1] / d[3].dim_value)),
)
print("rate", rate)

d[0].dim_value = 1
d[2].dim_value *= rate[0]
d[3].dim_value *= rate[1]

for output in model.graph.output:
    d = output.type.tensor_type.shape.dim
    print(d)
    d[0].dim_value = 1
    d[2].dim_value *= rate[0]
    d[3].dim_value *= rate[1]

onnx.save_model(
    model, "/home/nano/workspace/CenterFace/models/onnx/centerface_480_640.onnx")
# onnx.save_model(model, "/home/nano/workspace/CenterFace/models/onnx/centerface_720_1280.onnx")
print("Conversion done!")
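# A hedged addition (not in the original script): after rewriting the input and
# output dimensions, it can be worth validating the modified model as well.
onnx.checker.check_model(model)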
# # It is very important to check numpy arrays shape print('layer 0 : ', len(layer0), layer1.shape, type(layer0)) print('layer 1 : ', len(layer1), layer2.shape, type(layer1)) print('layer 2 : ', len(layer2), layer3.shape, type(layer2)) print('layer 3 : ', len(layer3), layer4.shape, type(layer3)) print('layer 4 : ', len(layer4), layer5.shape, type(layer4)) print('layer 5 : ', len(layer5), layer6.shape, type(layer5)) print('layer 6 : ', len(layer6), layer7.shape, type(layer6)) print('layer 7 : ', len(layer7), layer8.shape, type(layer7)) print('layer 8 : ', len(layer8), layer9.shape, type(layer8)) print('layer 9 : ', len(layer9), layer10.shape, type(layer9)) print('layer 10 : ', len(layer10), layer10.shape, type(layer10)) # onnx model load onnx_model = onnx.load_model(ONNX_MODEL_PATH) # onnx_graph information extraction onnx_weights = onnx_model.graph.initializer # Also, Checking ONNX model's weights shape is very important, # this is because they have some different shape both. # ------------------------------------- # ------ Layer shape information ------ # SNN ONNX # 34832, # 3, 3, 1, 32 784, 10 # 32, 10 # 3, 3, 32, 64 3200, 784 # 64, 784 # 3, 3, 64, 128 128, 64, 3, 3
def load(self):
    if not self.exist():
        self._download()
    return onnx.load_model(self.model_path())
def from_onnx(fname: str, config: Config) -> nx.DiGraph: # load onnx graph into memory model = onnx.load_model(fname) # check optimize and infer shapes #polished_model = onnx.utils.polish_model(model) polished_model = model initializers = {} value_info = {} io_map = {} # this will capture all of the graph for init in polished_model.graph.initializer: initializers[init.name] = init logging.log(logging.DEBUG, f"Registered initializer: {init.name}") # this captures all internal values, but not the graph output for some # reason (onnx spec is strange) #for vi in polished_model.graph.value_info: # value_info[vi.name] = vi # logging.log(logging.DEBUG, f"Registered value info: {vi.name}") ## this captures the graph output #for vi in polished_model.graph.output: # value_info[vi.name] = vi # logging.log(logging.DEBUG, f"Registered value info: {vi.name} (out)") # this captures all model inputs for inp in polished_model.graph.input: new_io = InOut(inp.name, None, None, None) # directly convert onnx initializers to static IOs in the graph if inp.name in initializers: new_io.kind = "static" new_io.data = numpy_helper.to_array(initializers[inp.name]).astype( np.float32 ) new_io.shape = np.shape(new_io.data) # pointers will be allocated later by the allocate pass else: new_io.kind = "pointer" new_io.data = None new_io.shape = onnx_type_to_shape(inp.type, config.user_width) io_map[inp.name] = new_io logging.log(logging.DEBUG, f"Built IO: {new_io}") # Create IOs for all node outputs for node in polished_model.graph.node: for out in node.output: new_io = InOut(out, None, None, None) new_io.kind = "pointer" new_io.data = None #new_io.shape = onnx_type_to_shape(value_info[out].type, config.user_width) new_io.shape = None io_map[out] = new_io logging.log(logging.DEBUG, f"Built IO: {new_io}") # at this point all inputs and outputs are availiable graph = nx.DiGraph() # usage map holds the uses of all of the _pointer_ IOs in the graph # eg IO : {use = [node2, node3], def = [node1]} # pointer IOs represent graph edges usage_map = {} for io_name, io_v in io_map.items(): if io_v.kind == "pointer": usage_map[io_name] = {"use": [], "def": []} # start numbering nodes at zero node_id = 0 # attach a load node for each of the dynamic inputs for dyninp_vi in polished_model.graph.input: if dyninp_vi.name not in initializers: built_node = build_load_node(dyninp_vi.name, io_map, usage_map, node_id) graph.add_node(node_id) graph.nodes[node_id]["node"] = built_node logging.log(logging.DEBUG, f"Built node: {built_node}") node_id += 1 # build normal nodes here for onnx_node in polished_model.graph.node: built_node = build_node(onnx_node, io_map, usage_map, node_id) graph.add_node(node_id) graph.nodes[node_id]["node"] = built_node logging.log(logging.DEBUG, f"Built node: {built_node}") node_id += 1 # attach a store node for each of the model outputs for out_vi in polished_model.graph.output: built_node = build_store_node(out_vi.name, io_map, usage_map, node_id) graph.add_node(node_id) graph.nodes[node_id]["node"] = built_node logging.log(logging.DEBUG, f"Built node: {built_node}") node_id += 1 # we don't know the iteration order so we build edges here # by checking the usage map for name, info in usage_map.items(): defs = info["def"] if len(defs) > 1: logging.log(logging.ERROR, f"Multiple defn of {name} at {defs}") if len(defs) == 0: logging.log(logging.ERROR, f"{name} never defined") source = info["def"][0] for use in info["use"]: graph.add_edge(source, use, buffer=name) logging.log(logging.DEBUG, f"Added edge {source} -> {use}" 
f" via {name}") # we sanity check that there are no nodes that have not been connected to # the graph num_wcc = nx.number_weakly_connected_components(graph) if num_wcc > 1: wcc = nx.weakly_connected_components(graph) logging.log(logging.WARN, "Multiple components in ouput graph") for i, wc in enumerate(wcc): logging.log(logging.WARN, f"\t<{i}> {wc}") return graph
def from_torch(model: TorchBertModel, device: Optional[torch.device] = None, backend: Optional[str] = None, use_memory_opt=False): """ Args: model : a PyTorch Bert Model device : cpu or GPU backend : a string to indicates kernel provides Four options. [onnxrt-cpu, onnxrt-gpu, turbo-cpu, turbo-gpu] use_memory_opt [bool] whether or not use memory opt for variable length inputs. """ use_gpu = False if device is None: device = model.device # we may need to move to GPU explicitly if 'cuda' in device.type and torch.cuda.is_available(): model.to(device) if backend is None: backend = "turbo" # On GPU turbo is faster use_gpu = True else: if backend is None: backend = "onnxrt" # On CPU onnxrt is faster if backend == "turbo": embeddings = BertEmbeddings.from_torch(model.embeddings) encoder = BertEncoder.from_torch(model.encoder) bertmodel_nopooler = BertModelNoPooler(embeddings, encoder) pooler = BertPooler.from_torch(model.pooler) return BertModel(bertmodel_nopooler, pooler, "turbo", model.config) elif backend == "onnxrt": import onnx import onnxruntime import onnxruntime.backend inputs = { 'input_ids': torch.randint(32, [2, 32], dtype=torch.long).to( device), # list of numerical ids for the tokenised text 'attention_mask': torch.ones([2, 32], dtype=torch.long).to(device), # dummy list of ones 'token_type_ids': torch.ones([2, 32], dtype=torch.long).to(device), # dummy list of ones } onnx_model_path = "/tmp/temp_turbo_onnx.model" with open(onnx_model_path, 'wb') as outf: torch.onnx.export( model=model, args=(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'] ), # model input (or a tuple for multiple inputs) f=outf, input_names=[ 'input_ids', 'attention_mask', 'token_type_ids' ], opset_version=11, # the ONNX version to export the model to do_constant_folding= True, # whether to execute constant folding for optimization output_names=['output'], dynamic_axes={ 'input_ids': [0, 1], 'attention_mask': [0, 1], 'token_type_ids': [0, 1] }) # num_threads = "8" # os.environ['OMP_NUM_THREADS'] = str(num_threads) # os.environ['MKL_NUM_THREADS'] = str(num_threads) onnx_model = onnx.load_model(f=onnx_model_path) onnx_model = onnxruntime.backend.prepare( model=onnx_model, device='GPU' if use_gpu else "CPU", graph_optimization_level=onnxruntime.GraphOptimizationLevel. ORT_ENABLE_ALL) return BertModel(onnx_model, None, "onnxrt")
def _load_model(self):
    path = self._find_with_extension(EXTENSION)
    self.nodes = onnx.load_model(path).graph.node
    self.sess = onnxr.InferenceSession(path)
or please refer to the official site.
https://github.com/onnx/onnx
"""
import sys
import timeit

import nnvm
import nnvm.compiler
import tvm
import onnx
import numpy as np
from tvm.contrib import graph_runtime

onnx_model = onnx.load_model(sys.argv[1])
# we can load the graph as NNVM compatible model
sym, params = nnvm.frontend.from_onnx(onnx_model)

x = np.ones([1, 3, 224, 224], dtype=np.float32)

######################################################################
# Compile the model on NNVM
# ---------------------------------------------
# We should be familiar with the process right now.
#target = 'cuda'
target = 'llvm'
# assume first input name is data
input_name = sym.list_input_names()[0]
shape_dict = {input_name: x.shape}
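# Not in the original fragment: a minimal sketch of the compile-and-run step this
# fragment is preparing for, using the same (long-deprecated) NNVM/TVM API that
# appears elsewhere in this file. `out_shape` is an assumption for illustration.
graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
ctx = tvm.cpu(0)
module = graph_runtime.create(graph, lib, ctx)
module.set_input(input_name, tvm.nd.array(x))
module.set_input(**params)
module.run()
out_shape = (1, 1000)  # assumed; depends on the model passed in sys.argv[1]
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()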
def _impl_(model_name: str, seq_len: int, batch_size: int, n: int, enable_random: bool, min_seq_len: int, max_seq_len: int, num_threads: int = 1, use_gpu: bool = False, enable_mem_opt: bool = False): import multiprocessing import os temp_fn = f"/tmp/temp_{model_name}_onnx.model" if enable_random and os.path.exists(temp_fn): import transformers cfg = transformers.BertConfig() vocab_size = cfg.vocab_size else: p = multiprocessing.Pool(1) vocab_size, cfg = p.apply(generate_onnx_model, args=(model_name, use_gpu, temp_fn, seq_len, batch_size, backend, enable_random)) p.close() import contexttimer import onnxruntime.backend import onnx import numpy import json import random if not onnxruntime.backend.supports_device(backend): raise RuntimeError( f"onnxruntime does not support {backend}, recompile it!") os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['MKL_NUM_THREADS'] = str(num_threads) model = onnx.load_model(f=temp_fn) model = onnxruntime.backend.prepare( model=model, device=backend, graph_optimization_level=onnxruntime.GraphOptimizationLevel. ORT_ENABLE_ALL) # Prepare a torch bert model to check correctness if benchmarking bert if model_name == "bert" and checkonnxrest: import transformers import torch torch.set_grad_enabled(False) torch_model = transformers.BertModel.from_pretrained( "bert-base-uncased") if enable_random: input_ids = numpy.random.randint(low=0, high=cfg.vocab_size - 1, size=(2, 17), dtype=numpy.int64) else: input_ids = numpy.random.randint(low=0, high=cfg.vocab_size - 1, size=(batch_size, seq_len), dtype=numpy.int64) torch_model.eval() torch_res = torch_model(torch.tensor(input_ids)) onnx_res = model.run(inputs=[input_ids]) assert (numpy.max( numpy.abs(torch_res[0].cpu().numpy() - onnx_res[0])) < 0.01) if enable_random: request_list = [] random.seed(0) for i in range(n): generated_seq_len = random.randint(min_seq_len, max_seq_len) input_ids = numpy.random.randint(low=0, high=cfg.vocab_size - 1, size=(1, generated_seq_len), dtype=numpy.int64) request_list.append(input_ids) if enable_latency_plot: import torch print( f"dump results to onnxrt_{num_threads}_{model_name}_latency.txt" ) result_list = [] with open(f"onnxrt_{num_threads}_{model_name}_latency.txt", "w") as of: for request in request_list: if use_gpu: start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) start.record() with contexttimer.Timer() as t: model.run(inputs=[request]) if not use_gpu: qps = n / t.elapsed time_consume = t.elapsed else: end.record() torch.cuda.synchronize() torch_elapsed = start.elapsed_time(end) / 1e3 qps = n / torch_elapsed time_consume = torch_elapsed result_list.append( [len(request.flatten()), time_consume]) elapse = 0. 
result_list = sorted(result_list, key=lambda s: s[0]) for item in result_list: of.write(f"{item[0]}, {item[1]}\n") elapse += item[1] print(f"elapsed {elapse} QPS {n/elapse}") else: if use_gpu: start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) start.record() with contexttimer.Timer() as t: for request in request_list: model.run(inputs=[request]) if not use_gpu: qps = n / t.elapsed time_consume = t.elapsed else: end.record() torch.cuda.synchronize() torch_elapsed = start.elapsed_time(end) / 1e3 qps = n / torch_elapsed time_consume = torch_elapsed else: input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, seq_len), dtype=numpy.int64) with contexttimer.Timer() as t: for _ in range(n): model.run(inputs=[input_ids]) if enable_random: print( json.dumps({ "QPS": qps, "elapsed": time_consume, "n": n, "max_seq_len": max_seq_len, "min_seq_len": min_seq_len, "framework": f"onnx_rt_{backend}", "thread_num": num_threads, "model_name": model_name })) else: print( json.dumps({ "QPS": n / t.elapsed, "elapsed": t.elapsed, "n": n, "batch_size": batch_size, "seq_len": seq_len, "framework": f"onnx_rt_{backend}", "n_threads": num_threads, "model_name": model_name }))
                          1: 'sequence_length'
                      }
                  },
                  custom_opsets={"com.microsoft": 1})
print(f"ONNX model exported to {onnx_model_path}")

if args.sequence_length % model.config.attention_window[0] == 0:
    print(
        f"*Attention*: You need input padding for inference: input sequence length shall be multiple of {model.config.attention_window[0]}. It is because the example input for export ONNX model does not need padding so padding logic is not in onnx model."
    )

# Restore Huggingface implementation like the following:
# LongformerSelfAttention.forward = original_forward

if args.precision != 'fp32' or args.optimize_onnx:
    from onnx import load_model
    from onnxruntime.transformers.onnx_model_bert import BertOnnxModel, BertOptimizationOptions
    model = load_model(onnx_model_path, format=None, load_external_data=True)
    optimization_options = BertOptimizationOptions('bert')
    optimizer = BertOnnxModel(model, num_heads=16, hidden_size=768)
    optimizer.optimize(optimization_options)

    optimized_model_path = model_name + "_fp32.onnx"
    optimizer.save_model_to_file(optimized_model_path)
    print(f"optimized fp32 model saved to {optimized_model_path}")

    if args.precision == 'fp16':
        optimizer.convert_model_float32_to_float16(cast_input_output=True)
        optimized_model_path = model_name + "_fp16.onnx"
        optimizer.save_model_to_file(optimized_model_path)
        print(f"optimized fp16 model saved to {optimized_model_path}")
def expect(self, model, args, name=None, skip_opset_version=None, skip_outvalue_version=None, custom_model_test_func=None, expected_num_initializers=None, **kwargs): """Compare model output and test runtime output. Make an ONNX model from target model with args, and put output directory. Then test runtime load the model, and compare. Arguments: model (~chainer.Chain): The target model. args (list or dict): Arguments of the target model. name (str): name of test. Set class name on default. skip_opset_version (list): Versions to skip test. skip_outvalue_version (list): Versions to skip output value check. custom_model_test_func (func): A function to check generated model. The functions is called before checking output values. ONNX model is passed to arguments. expected_num_initializers (int): The expected number of initializers in the output ONNX model. **kwargs (dict): keyward arguments for ``onnx_chainer.export``. """ test_name = name if test_name is None: test_name = self.default_name for opset_version in self.target_opsets: if skip_opset_version is not None and\ opset_version in skip_opset_version: continue dir_name = 'test_' + test_name test_path = gen_test_data_set(model, args, dir_name, opset_version, **kwargs) onnx_model_path = os.path.join(test_path, 'model.onnx') assert os.path.isfile(onnx_model_path) with open(onnx_model_path, 'rb') as f: onnx_model = onnx.load_model(f) check_all_connected_from_inputs(onnx_model) if expected_num_initializers is not None: actual_num_initializers = len(onnx_model.graph.initializer) assert expected_num_initializers == actual_num_initializers graph_input_names = _get_graph_input_names(onnx_model) if kwargs.get('input_names', {}): input_names = kwargs['input_names'] if isinstance(input_names, dict): expected_names = list(sorted(input_names.values())) else: expected_names = list(sorted(input_names)) assert list(sorted(graph_input_names)) == expected_names if kwargs.get('output_names', {}): output_names = kwargs['output_names'] if isinstance(output_names, dict): expected_names = list(sorted(output_names.values())) else: expected_names = list(sorted(output_names)) graph_output_names = [v.name for v in onnx_model.graph.output] assert list(sorted(graph_output_names)) == expected_names # Input data is generaged by `network_inputs` dict, this can # introduce unexpected conversions. Check values of input PB with # test args. if isinstance(args, (tuple, list)): flat_args = args elif isinstance(args, dict): flat_args = args.values() else: flat_args = [args] input_data = load_input_data(test_path) assert len(input_data) == len(flat_args) for i, arg in enumerate(flat_args): array = arg.array if isinstance(arg, chainer.Variable) else arg array = chainer.cuda.to_cpu(array) np.testing.assert_allclose(array, input_data[i], rtol=1e-5, atol=1e-5) if custom_model_test_func is not None: custom_model_test_func(onnx_model) if skip_outvalue_version is not None and\ opset_version in skip_outvalue_version: continue # Export function can be add unexpected inputs. Collect inputs # from ONNX model, and compare with another input list got from # test runtime. if self.check_out_values is not None: self.check_out_values(test_path, input_names=graph_input_names)
def optimize_model(input, model_type='bert', num_heads=0, hidden_size=0, optimization_options=None, opt_level=None, use_gpu=False, only_onnxruntime=False): """ Optimize Model by OnnxRuntime and/or python fusion logic. ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/resources/graph-optimizations.html). However, the coverage is limited. We also have graph fusions that implemented in Python to improve the coverage. They can combined: ONNX Runtime will run first when opt_level > 0, then graph fusions in Python will be applied. To use ONNX Runtime only and no Python fusion logic, use only_onnxruntime flag and a positive opt_level like optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True) When opt_level is None, we will choose default optimization level according to model type. When opt_level is 0 and only_onnxruntime is False, only python fusion logic is used and onnxruntime is disabled. When opt_level > 1, use_gpu shall set properly since the optimized graph might contain operators for GPU or CPU only. If your model is intended for GPU inference only (especially float16 or mixed precision model), it is recommended to set use_gpu to be True, otherwise the model is not optimized for GPU inference. For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters. Args: input (str): input model path. model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'. num_heads (int, optional): number of attention heads. Defaults to 0. 0 allows detect the parameter from graph automatically (for model_type "bert" only). hidden_size (int, optional): hidden size. Defaults to 0. 0 allows detect the parameter from graph automatically (for model_type "bert" only). optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None. opt_level (int, optional): onnxruntime graph optimization level (0, 1, 2 or 99) or None. Defaults to None. When the value is None, default value (1 for bert and gpt2, 0 for other model types) will be used. When the level > 0, onnxruntime will be used to optimize model first. use_gpu (bool, optional): use gpu or not for onnxruntime. Defaults to False. only_onnxruntime (bool, optional): only use onnxruntime to optimize model, and no python fusion. Defaults to False. Returns: object of an optimizer class. """ assert opt_level is None or opt_level in [0, 1, 2, 99] if model_type != "bert" and (num_heads == 0 or hidden_size == 0): logger.warning("Please specify parameters of num_heads and hidden_size when model_type is not 'bert'") (optimizer_class, producer, default_opt_level) = MODEL_TYPES[model_type] if opt_level is None: opt_level = default_opt_level temp_model_path = None if opt_level > 1: temp_model_path = optimize_by_onnxruntime(input, use_gpu=use_gpu, opt_level=opt_level) elif opt_level == 1: # basic optimizations (like constant folding and cast elimation) are not specified to exection provider. # CPU provider is used here so that there is no extra node for GPU memory copy. 
temp_model_path = optimize_by_onnxruntime(input, use_gpu=False, opt_level=1) if only_onnxruntime and not temp_model_path: logger.warning("Please specify a positive value for opt_level when only_onnxruntime is True") model = load_model(temp_model_path or input, format=None, load_external_data=True) if model.producer_name and producer != model.producer_name: logger.warning( f"Model producer not matched: Expect {producer}, Got {model.producer_name} {model.producer_version}. Please specify correct --model_type parameter." ) if optimization_options is None: optimization_options = FusionOptions(model_type) optimizer = optimizer_class(model, num_heads, hidden_size) if not only_onnxruntime: optimizer.optimize(optimization_options) # Remove the temporary model. if temp_model_path: os.remove(temp_model_path) logger.debug("Remove tempoary model: {}".format(temp_model_path)) optimizer.model.producer_name = "onnxruntime.transformers" from onnxruntime import __version__ as onnxruntime_version optimizer.model.producer_version = onnxruntime_version return optimizer
def __init__(self, model_path):
    self.model_path = model_path
    self.model_framework = None
    self.testX = None
    self.testY = None  # test data
    self.onnx_load_model = onnx.load_model(model_path)
    self.model_type = None
def load(self):
    return onnx.load_model(self.model_path())
def export_onnx( decoder: Union[T5Decoder, T5DecoderInit], device: torch.device, onnx_model_path: str, verbose: bool = True, use_external_data_format: bool = False, use_int32_inputs: bool = False, ): """Export decoder to ONNX Args: decoder (Union[T5Decoder, T5DecoderNoPastState]): decoder object device (torch.device): device of decoder object onnx_model_path (str): onnx path verbose (bool, optional): print verbose information. Defaults to True. use_external_data_format (bool, optional): use external data format or not. Defaults to False. use_int32_inputs (bool, optional): use int32 inputs """ assert isinstance(decoder, (T5Decoder, T5DecoderInit)) inputs = T5DecoderInputs.create_dummy( decoder.config, batch_size=2, encode_sequence_length=3, past_decode_sequence_length=5 if isinstance(decoder, T5Decoder) else 0, device=device, use_int32_inputs=use_int32_inputs, ) input_list = inputs.to_list() past_names = PastKeyValuesHelper.get_past_names(decoder.config.num_layers, present=False) present_names = PastKeyValuesHelper.get_past_names(decoder.config.num_layers, present=True) present_self_names = present_names[: 2 * decoder.config.num_layers] input_past_names = past_names if isinstance(decoder, T5Decoder) else [] output_present_names = present_self_names if isinstance(decoder, T5Decoder) else present_names output_names = ["logits"] + output_present_names # Shape of input tensors (sequence_length==1): # input_ids: (batch_size, sequence_length) # encoder_attention_mask: (batch_size, encode_sequence_length) # encoder_hidden_states: (batch_size, encode_sequence_length, hidden_size) # past_self_*: (batch_size, num_heads, past_decode_sequence_length, head_size) # past_cross_*: (batch_size, num_heads, encode_sequence_length, head_size) # Shape of output tensors: # logits: (batch_size, sequence_length, vocab_size) # past_self_*: (batch_size, num_heads, past_decode_sequence_length + sequence_length, head_size) # past_cross_*: (batch_size, num_heads, encode_sequence_length, head_size) input_names = ["input_ids"] input_names.append("encoder_attention_mask") input_names.append("encoder_hidden_states") input_names.extend(input_past_names) dynamic_axes = { "input_ids": { 0: "batch_size", # 1: 'sequence_length' }, "encoder_attention_mask": {0: "batch_size", 1: "encode_sequence_length"}, "encoder_hidden_states": {0: "batch_size", 1: "encode_sequence_length"}, "logits": { 0: "batch_size", # 1: 'sequence_length' }, } for name in input_past_names: dynamic_axes[name] = { 0: "batch_size", 2: "past_decode_sequence_length" if "self" in name else "encode_sequence_length", } for name in output_present_names: if "cross" in name: dynamic_axes[name] = {0: "batch_size", 2: "encode_sequence_length"} else: # self attention past state if isinstance(decoder, T5Decoder): dynamic_axes[name] = { 0: "batch_size", 2: "past_decode_sequence_length + 1", } else: dynamic_axes[name] = { 0: "batch_size", # 2: 'sequence_length' } Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory() as tmp_dir_name: temp_onnx_model_path = os.path.join(tmp_dir_name, "decoder.onnx") Path(temp_onnx_model_path).parent.mkdir(parents=True, exist_ok=True) torch_onnx_export( decoder, args=tuple(input_list), f=temp_onnx_model_path if use_external_data_format else onnx_model_path, export_params=True, input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, opset_version=12, do_constant_folding=True, use_external_data_format=use_external_data_format, verbose=verbose, ) if use_external_data_format: 
model = onnx.load_model(temp_onnx_model_path, load_external_data=True) OnnxModel.save( model, onnx_model_path, save_as_external_data=True, all_tensors_to_one_file=True, )
######################################################################
# Load pretrained ONNX model
# ---------------------------------------------
# The example super resolution model used here is exactly the same model in onnx tutorial
# http://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
# we skip the pytorch model construction part, and download the saved onnx model
model_url = ''.join([
    'https://gist.github.com/zhreshold/',
    'bcda4716699ac97ea44f791c24310193/raw/',
    '93672b029103648953c4e5ad3ac3aadf346a4cdc/',
    'super_resolution_0.2.onnx'
])
model_path = download_testdata(model_url, 'super_resolution.onnx', module='onnx')
# now you have super_resolution.onnx on disk
onnx_model = onnx.load_model(model_path)
# we can load the graph as NNVM compatible model
sym, params = nnvm.frontend.from_onnx(onnx_model)

######################################################################
# Load a test image
# ---------------------------------------------
# A single cat dominates the examples!
from PIL import Image
img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
img_path = download_testdata(img_url, 'cat.png', module='data')
img = Image.open(img_path).resize((224, 224))
img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
img_y, img_cb, img_cr = img_ycbcr.split()
x = np.array(img_y)[np.newaxis, np.newaxis, :, :]
def test_qdq_extra_options_2(self): # (input) # | # Add # / | \ # MatMul MatMul MatMul # | | | # (output)(output)(output) initializers = [] input_tensor = helper.make_tensor_value_info('L', TensorProto.FLOAT, [5, 5]) output_tensor1 = helper.make_tensor_value_info('M', TensorProto.FLOAT, [5, 5]) output_tensor2 = helper.make_tensor_value_info('N', TensorProto.FLOAT, [5, 5]) output_tensor3 = helper.make_tensor_value_info('O', TensorProto.FLOAT, [5, 5]) add_weight_data = np.random.normal(0, 0.1, [5, 5]).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(add_weight_data, name="P")) matmul_weight_data_1 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_1, name="Q")) matmul_weight_data_2 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_2, name="R")) matmul_weight_data_3 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_2, name="S")) add_node = onnx.helper.make_node('Add', ['L', 'P'], ['T'], name='Add') matmul_node_1 = onnx.helper.make_node('MatMul', ['T', 'Q'], ['M'], name='MatMul1') matmul_node_2 = onnx.helper.make_node('MatMul', ['T', 'R'], ['N'], name='MatMul2') matmul_node_3 = onnx.helper.make_node('MatMul', ['T', 'S'], ['O'], name='MatMul3') graph = helper.make_graph([add_node, matmul_node_1, matmul_node_2, matmul_node_3], 'QDQ_Test_Finetune_2', [input_tensor], [output_tensor1, output_tensor2, output_tensor3], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) test_model_path = './test_qdq_finetune_2.onnx' onnx.save(model, test_model_path) compute_range = { 'L': [0.1, 0.1], 'M': [0.1, 0.1], 'N': [0.1, 0.1], 'O': [0.1, 0.1], 'P': [0.1, 0.1], 'Q': [0.1, 0.1], 'R': [0.1, 0.1], 'S': [0.1, 0.1], 'T': [0.1, 0.1], } op_types_to_quantize = ['Add', 'MatMul'] mode = QuantizationMode.QLinearOps model = onnx.load_model(test_model_path, False) quantizer = QDQQuantizer( model, True, #per_channel False, #reduce_range mode, True, #static QuantType.QInt8, #weight_type QuantType.QInt8, #activation_type compute_range, [], #nodes_to_quantize ['Add'], #nodes_to_exclude op_types_to_quantize, {'ActivationSymmetric' : True, 'AddQDQPairToWeight' : True, 'OpTypesToExcludeOutputQuantizatioin': op_types_to_quantize, 'DedicatedQDQPair': True}) #extra_options quantizer.quantize_model() qdq_model_path = './test_qdq_finetune_qdq_2.onnx' quantizer.model.save_model_to_file(qdq_model_path, False) # Three dedicated QDQ pair should be generated and feed into each MatMul node # Also QDQ pair should not be added to Add node # QDQ pair shoud not be added to node's output for node in quantizer.model.nodes(): if node.name == 'MatMul1': self.assertTrue("T_DequantizeLinear_1" in node.input) if node.name == 'MatMul2': self.assertTrue("T_DequantizeLinear_2" in node.input) if node.name == 'MatMul3': self.assertTrue("T_DequantizeLinear_3" in node.input) if node.name == 'Add': for input in node.input: self.assertTrue("DequantizeLinear" not in input) # QDQ pair shoud not be added to MatMul's output if node.op_type == 'QuantizeLinear': self.assertTrue(node.input[0] not in ['M_QuantizeLinearInput', 'N_QuantizeLinearInput', 'O_QuantizeLinearInput'])
parser = argparse.ArgumentParser()
parser.add_argument("--onnx_path", help="Path of onnx model", type=str, required=True)
parser.add_argument("--batch_size", help="Batch size", type=int, default=1)
parser.add_argument("--to_rasp", help="Compile to Raspberry", action='store_true')
parser.add_argument("--to_local", help="Compile to local", action='store_true')
args = parser.parse_args()

import cvtransforms as dataset
import tvm
import onnx
import nnvm
import numpy as np

model_name = args.onnx_path.split('/')[-1].split('.')[0]
print("model_name = ", model_name)

onnx_model = onnx.load_model(args.onnx_path)
sym, params = nnvm.frontend.from_onnx(onnx_model)

import nnvm.compiler
# assume first input name is data
input_name = sym.list_input_names()[0]
shape_dict = {input_name: (1, 3, 32, 32)}

for x in params:
    if params[x].shape == ():
        params[x] = tvm.nd.array(np.float32(0))

# Set cross-compilation target
postfixs = []
targets = []
if args.to_local:
mlp_model.init_params(mx.init.Xavier())

# Save the parameters and symbol to files
mlp_model.save_params(MXNET_PARAMS_PATH_DEFAULT)
mlp.save(MXNET_SYMBOL_PATH_DEFAULT)

# Export the ONNX specification of the model, using the parameters and symbol files
onnx_mxnet.export_model(sym=MXNET_SYMBOL_PATH_DEFAULT,
                        params=MXNET_PARAMS_PATH_DEFAULT,
                        input_shape=[(64, input_length)],
                        onnx_file_path=ONNX_FILE_PATH_DEFAULT)

############################################################################

############################################################################
# Load ONNX file and remove files
model = onnx.load_model(ONNX_FILE_PATH_DEFAULT)
if os.path.exists(MXNET_PARAMS_PATH_DEFAULT):
    os.remove(MXNET_PARAMS_PATH_DEFAULT)
if os.path.exists(MXNET_SYMBOL_PATH_DEFAULT):
    os.remove(MXNET_SYMBOL_PATH_DEFAULT)
if os.path.exists(ONNX_FILE_PATH_DEFAULT):
    os.remove(ONNX_FILE_PATH_DEFAULT)

############################################################################
# Run the model on the task (requires an API key).
run = openml.runs.run_model_on_task(model, task, avoid_duplicate_runs=False)

# Publish the experiment on OpenML (optional, requires an API key).
run.publish()

print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))

############################################################################
def from_torch(model: TorchGPT2Model, device: Optional[torch.device] = None, backend: Optional[str] = "onnxrt"): """ Args: model : a PyTorch GPT2 Model device : cpu or GPU backend : a string to indicates kernel providers Four options. [onnxrt, turbo] """ use_gpu = False if device is None: device = model.device # may need to move to GPU explicitly if 'cuda' in device.type and torch.cuda.is_available(): model.to(device) if backend is None: backend = "onnxrt" # On GPU turbo is faster use_gpu = True else: if backend is None: backend = "onnxrt" # On CPU onnxrt is faster if backend == "turbo": raise ("Not Implemented GPT2 on Turbo Backend") if backend == "onnxrt": import onnx import onnxruntime import onnxruntime.backend # TODO(jiaruifang) Figure out the meaning of GPT2 enable_past_input = False num_layer = model.config.n_layer present_names = [f'present_{i}' for i in range(num_layer)] output_names = ["last_state"] + present_names input_names = ['input_ids'] dynamic_axes = { 'input_ids': { 0: 'batch_size', 1: 'seq_len' }, #'token_type_ids' : {0: 'batch_size', 1: 'seq_len'}, #'attention_mask' : {0: 'batch_size', 1: 'seq_len'}, 'last_state': { 0: 'batch_size', 1: 'seq_len' } } for name in present_names: dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'} inputs = { 'input_ids': torch.randint(32, [2, 32], dtype=torch.long).to(device) } if enable_past_input: past_names = [f'past_{i}' for i in range(num_layer)] input_names = [ 'input_ids' ] + past_names #+ ['token_type_ids', 'attention_mask'] dummy_past = [ torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer) ] for name in past_names: dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'} export_inputs = ( inputs['input_ids'], tuple(dummy_past) ) #, inputs['token_type_ids'], inputs['attention_mask']) else: export_inputs = (inputs['input_ids']) output_dir = './gpt2_onnx' if not os.path.exists(output_dir): os.makedirs(output_dir) onnx_model_path = os.path.join( output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input))) torch.onnx.export(model, args=export_inputs, f=onnx_model_path, input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, opset_version=11, do_constant_folding=True, verbose=False) onnx_model = onnx.load_model(f=onnx_model_path) onnx_model = onnxruntime.backend.prepare( model=onnx_model, device='GPU' if use_gpu else 'CPU', graph_optimization_level=onnxruntime.GraphOptimizationLevel. ORT_ENABLE_ALL) return GPT2Model(onnx_model, "onnxrt")
def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('onnx_model_path',
                        help='The path of onnx model to be converted.')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()
    model_path = args.onnx_model_path
    if not os.path.exists(args.onnx_model_path):
        print("Model file <{}> does not exist.".format(model_path))
    else:
        model_onnx = onnx.load_model(model_path)

        # preprocess
        # each pixel range: [-1, 1]
        preprocessing_args = {
            'is_bgr': False,
            'red_bias': -1.0,
            'green_bias': -1.0,
            'blue_bias': -1.0,
            'image_scale': 2.0 / 255.0
        }

        # conversion
        mlmodel = convert(model_onnx,
                          mode='regression',
                          preprocessing_args=preprocessing_args,
if __name__ == '__main__':
    args = docopt(__doc__)
    input_file_path = args['INPUT_FILE']
    if not args['OUTPUT_FILE']:
        output_file_path = _get_output_file_path(
            *os.path.splitext(input_file_path))
    else:
        output_file_path = args['OUTPUT_FILE']

    print('Converting {} to {}.'.format(input_file_path, output_file_path))

    if not os.path.exists(input_file_path):
        sys.exit('ERROR: Provided input model path does not exist: {}'.format(
            input_file_path))

    # convert from binary format to text format
    if _is_bin_file(input_file_path) and _is_txt_file(output_file_path):
        str_msg = _bin2txt(onnx.load_model(input_file_path))
        with open(output_file_path, 'w') as f:
            f.write(str_msg)
    # convert from text format to binary format
    elif _is_txt_file(input_file_path) and _is_bin_file(output_file_path):
        with open(input_file_path, 'r') as f:
            converted_model = _txt2bin(f.read())
        onnx.save(converted_model, output_file_path)
    else:
        sys.exit(
            'ERROR: Provided input or output file has unsupported format.')
parser.add_argument('--pretrained',
                    help='pretrained dladcnv2 model',
                    default='./dladcnv2.onnx',
                    type=str)
parser.add_argument('--input_shape',
                    nargs='+',
                    default=[3, 3, 640, 640],
                    type=int,
                    help='input shape.')
parser.add_argument('--onnx',
                    help='onnx model',
                    default='./dladcnv2-d3.onnx',
                    type=str)
args = parser.parse_args()

model = onnx.load_model(args.pretrained)

input_shape = args.input_shape
d = model.graph.input[0].type.tensor_type.shape.dim
rate = (input_shape[2] / d[2].dim_value, input_shape[3] / d[3].dim_value)
print("rate: ", rate)
d[0].dim_value = input_shape[0]
#d[0].dim_param = '?'
d[2].dim_value = int(d[2].dim_value * rate[0])
d[3].dim_value = int(d[3].dim_value * rate[1])
for output in model.graph.output:
    d = output.type.tensor_type.shape.dim
    d[0].dim_value = input_shape[0]
    #d[0].dim_param = '?'
    d[2].dim_value = int(d[2].dim_value * rate[0])
onnx_model = onnx.load(export_onnx_file)  # load onnx model
model_simp, check = simplify(onnx_model)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, export_onnx_file)
print('finished exporting onnx')

# Check the exported onnx file
test = onnx.load(export_onnx_file)
onnx.checker.check_model(test)
# Print the onnx computation graph
# print(onnx.helper.printable_graph(test.graph))
print("\nonnx output ==> Passed!\n")

# Compute the output error introduced by the conversion
onnx_model = onnx.load_model(export_onnx_file)                # read the onnx model parameters and build the model
sess = ort.InferenceSession(onnx_model.SerializeToString())   # initialize the inference session
sess.set_providers(['CPUExecutionProvider'])                  # deploy the model to the CPU
input_name = sess.get_inputs()[0].name                        # read the network input name
output_name = sess.get_outputs()[0].name                      # read the network output name
onnx_output = sess.run([output_name], {input_name: x.cpu().numpy()})  # run onnx inference on the data

# Compute the conversion error
evalue = np.absolute(np.mean(torch_output_value - onnx_output))
print("\ntorch to onnx error: ", evalue)

# Show the network output and the graph structure
session = ort.InferenceSession(export_onnx_file)  # create a runtime session, similar to tensorflow
out_r = session.run(
    None, {"input": np.random.rand(1, 3, 112, 112).astype('float32')
def _assert_onnx_validity(model_path):
    model_proto = onnx.load_model(model_path)
    checker.check_graph(model_proto.graph)
import urllib
urllib.urlretrieve(url, path)

######################################################################
# Load pretrained ONNX model
# ---------------------------------------------
# The example super resolution model used here is exactly the same model in onnx tutorial
# http://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
# we skip the pytorch model construction part, and download the saved onnx model
model_url = ''.join(['https://gist.github.com/zhreshold/',
                     'bcda4716699ac97ea44f791c24310193/raw/',
                     '93672b029103648953c4e5ad3ac3aadf346a4cdc/',
                     'super_resolution_0.2.onnx'])
download(model_url, 'super_resolution.onnx', True)
# now you have super_resolution.onnx on disk
onnx_model = onnx.load_model('super_resolution.onnx')
# we can load the graph as NNVM compatible model
sym, params = nnvm.frontend.from_onnx(onnx_model)

######################################################################
# Load a test image
# ---------------------------------------------
# A single cat dominates the examples!
from PIL import Image
img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
download(img_url, 'cat.png')
img = Image.open('cat.png').resize((224, 224))
img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
img_y, img_cb, img_cr = img_ycbcr.split()
x = np.array(img_y)[np.newaxis, np.newaxis, :, :]