def helper(symbol, inputs, dtype,
           np_forward, np_backward=None):
    """Build ``symbol`` for every target from ``ctx_list()`` and compare with NumPy.

    Parameters
    ----------
    symbol : nnvm symbol
        Symbol that is compiled and executed.
    inputs : dict
        Maps an input name to a sequence whose first item is the input shape
        and whose optional second item is collected as a gradient input.
    dtype : str
        dtype of the random input arrays and of the output buffers.
    np_forward : callable
        Called with the random inputs as keyword arguments; returns the
        reference forward result.
    np_backward : callable, optional
        Called with the same keyword arguments; its result, scaled by random
        head gradients, is the reference for output 0 of the gradient graph.
    """
    ishapes = {name: spec[0] for name, spec in inputs.items()}
    np_inputs = {
        name: np.random.uniform(size=spec[0]).astype(dtype)
        for name, spec in inputs.items()
    }
    input_syms = [spec[1] for spec in inputs.values() if len(spec) > 1]

    for target, ctx in ctx_list():
        # forward
        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes)
        module = graph_runtime.create(graph, lib, ctx)
        module.run(**np_inputs)
        expected = np_forward(**np_inputs)
        result = module.get_output(0, tvm.nd.empty(expected.shape, dtype))
        np.testing.assert_allclose(result.asnumpy(), expected,
                                   atol=1e-5, rtol=1e-5)

        if not np_backward:
            continue

        # backward
        graph._set_symbol_list_attr("grad_ys", symbol)
        for grad_x in input_syms:
            graph._set_symbol_list_attr("grad_xs", grad_x)
        graph._set_symbol_list_attr("grad_ys_out_grad",
                                    sym.Variable("head_grads"))
        graph = graph.apply("Gradient")
        # The gradient graph takes the head gradients as an extra input.
        ishapes["head_grads"] = expected.shape
        graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
        module = graph_runtime.create(graph, lib, ctx)
        head_grads = np.random.uniform(size=expected.shape).astype(dtype)
        expected = head_grads * np_backward(**np_inputs)
        module.run(head_grads=head_grads, **np_inputs)
        result = module.get_output(0, tvm.nd.empty(expected.shape, dtype))
        np.testing.assert_allclose(result.asnumpy(), expected,
                                   atol=1e-5, rtol=1e-5)
def test_concatenate_conv2d():
    """Check ``concat + conv2d(concat)`` against a NumPy/topi reference.

    The input is concatenated with itself along axis 1, passed through a
    bias-free 1x1 convolution and added back to the concatenation.  The
    compiled graph is also expected to contain exactly four nodes.
    """
    ch = 3
    size = 8
    dtype = "float32"
    dshape = (1, ch, size, size)
    kshape = (ch*2, ch*2, 1, 1)
    oshape = (1, ch*2, size, size)
    shape_dict = {"data": dshape}

    data = sym.Variable(name="data")
    concat = sym.concatenate(data, data, axis=1)
    conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2,
                      use_bias=False, name="conv")
    net = sym.elemwise_add(concat, conv)

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        # data, conv weight, conv op, concat
        assert graph.index.num_nodes == 4

        data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        module = graph_runtime.create(graph, lib, ctx)
        module.run(data=data_nd, conv_weight=kernel_nd)
        # get output
        result = module.get_output(0, tvm.nd.empty(oshape, dtype))

        # reference computation
        concat_np = np.concatenate((data_nd.asnumpy(), data_nd.asnumpy()),
                                   axis=1)
        conv_np = topi.testing.conv2d_nchw_python(
            concat_np, kernel_nd.asnumpy(), (1,1), 'SAME')
        ref = concat_np + conv_np
        tvm.testing.assert_allclose(result.asnumpy(), ref, rtol=1e-5)
def test_multibox_transform_loc():
    """Run multibox_transform_loc followed by non_max_suppression.

    A hand-written set of class probabilities, location predictions and
    anchors is fed to the compiled graph and output 0 is compared with a
    hard-coded expected array.
    """
    batch_size = 1
    num_anchors = 3
    num_classes = 3
    dtype = "float32"

    cls_prob = sym.Variable("cls_prob")
    loc_preds = sym.Variable("loc_preds")
    anchors = sym.Variable("anchors")
    transform_loc_data, valid_count = sym.multibox_transform_loc(
        cls_prob=cls_prob, loc_pred=loc_preds, anchor=anchors)
    out = sym.non_max_suppression(data=transform_loc_data,
                                  valid_count=valid_count,
                                  return_indices=False)

    # Manually create test case
    np_cls_prob = np.array([[[0.2, 0.5, 0.3],
                             [0.25, 0.3, 0.45],
                             [0.7, 0.1, 0.2]]])
    np_loc_preds = np.array([[0.1, -0.2, 0.3, 0.2,
                              0.2, 0.4, 0.5, -0.3,
                              0.7, -0.2, -0.4, -0.8]])
    np_anchors = np.array([[[-0.1, -0.1, 0.1, 0.1],
                            [-0.2, -0.2, 0.2, 0.2],
                            [1.2, 1.2, 1.5, 1.5]]])
    expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
                                 [0, 0.44999999, 1, 1, 1, 1],
                                 [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])

    for target, ctx in ctx_list():
        input_shapes = {
            "cls_prob": (batch_size, num_anchors, num_classes),
            "loc_preds": (batch_size, num_anchors * 4),
            "anchors": (1, num_anchors, 4),
        }
        graph, lib, _ = nnvm.compiler.build(out, target, input_shapes)
        module = graph_runtime.create(graph, lib, ctx)
        feed = {
            "cls_prob": np_cls_prob.astype(dtype),
            "loc_preds": np_loc_preds.astype(dtype),
            "anchors": np_anchors.astype(dtype),
        }
        module.set_input(**feed)
        module.run()
        tvm_out = module.get_output(
            0, tvm.nd.empty(expected_np_out.shape, dtype))
        tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out,
                                    atol=1e-5, rtol=1e-5)
def _impl_v1(cls, inputs, attr, params):
    """Convert a reshape whose target shape may be an attribute or an input.

    The target shape is resolved in this order:

    1. the ``'shape'`` entry of ``attr``;
    2. the entry of ``params`` named by the second input;
    3. the value obtained by building and running the second input's
       expression on the ``llvm`` target.

    In the third case the second input is removed from ``inputs`` in place.
    """
    if 'shape' in attr:
        return _op.reshape(inputs[0], attr['shape'])

    if get_name(inputs[1]) in params:
        static_shape = tuple(params[inputs[1].name_hint].asnumpy())
        return _op.reshape(inputs[0], static_shape)

    # Try to infer shape by precompute prune if possible.
    # TODO: good to check inputs to be in params.
    # to be enhanced when relay support list_input_names API of NNVM
    logging.warning("Infering Reshape argument by precompute")
    from tvm.contrib import graph_runtime

    shape_func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1])
    with tvm.relay.build_config(opt_level=0):
        graph, lib, params = tvm.relay.build(shape_func, target="llvm",
                                             params=params)
    ctx = tvm.context("llvm", 0)
    module = graph_runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.run()
    computed = module.get_output(0)
    # The shape is now a constant, so it is no longer an input of the op.
    inputs.pop(1)
    new_shape = tuple(computed.asnumpy().astype('int32').flatten())
    return _op.reshape(inputs[0], new_shape)
def test_avg_pool2d_no_count_pad():
    """Check avg_pool2d with ``count_include_pad=False`` against NumPy.

    The reference zero-pads the input by hand and, for every output
    position, divides the window sum by the number of strictly positive
    entries in the window.  The input is drawn with ``low=0.001`` so real
    data is always positive and only padding is excluded from the count.
    """
    kh, kw = (4, 4)
    sh, sw = (2, 2)
    ph, pw = (2, 2)

    x = sym.Variable("x")
    # Strides are (stride_h, stride_w); the reference below strides rows by
    # ``sh`` and columns by ``sw``, so the operator must get the same pair.
    y = sym.avg_pool2d(x, pool_size=(kh, kw), strides=(sh, sw),
                       padding=(ph, pw), name="y", count_include_pad=False)
    dtype = "float32"
    n = 1
    (ic, ih, iw) = (3, 28, 28)
    (oc, oh, ow) = (3, 15, 15)

    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
    pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype)
    # Index set of the un-padded region inside the padded buffer.
    no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw)))
    pad_np[np.ix_(*no_zero)] = a_np

    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)
    for i in range(oh):
        for j in range(ow):
            window = pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw]
            pad_count = np.sum(window > 0, axis=(2,3))
            # Guard against an all-padding window (count of zero).
            b_np[:,:,i,j] = np.sum(window, axis=(2,3)) / np.maximum(pad_count, 1)
    b_np = np.maximum(b_np, 0.0)

    shape_dict = {"x": (n, ic, ih, iw)}
    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, shape_dict)
        m = graph_runtime.create(graph, lib, ctx)
        data = tvm.nd.array(a_np)
        m.run(x=data)
        out = m.get_output(0, tvm.nd.empty((n, oc, oh, ow), dtype))
        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
def test_forward_minimum():
    """Compare MXNet's ``_minimum`` with the converted NNVM graph.

    The symbol is converted with ``frontend.from_mxnet``, built at
    ``opt_level=3`` for every target, and its output is compared with the
    result MXNet computes on the same random inputs.
    """
    dshape = (10, 20)
    dtype = 'float32'

    a = mx.sym.var('a')
    b = mx.sym.var('b')
    mx_sym = mx.sym._internal._minimum(a, b)

    np_a = np.random.uniform(size=dshape).astype(dtype)
    np_b = np.random.uniform(size=dshape).astype(dtype)
    mx_a = mx.nd.array(np_a)
    mx_b = mx.nd.array(np_b)

    # A bound module is only used to obtain the (arg, aux) parameter dicts.
    mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b'])
    mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False)
    mod.init_params()
    args, auxs = mod.get_params()

    mx_out = mx.nd._internal._minimum(mx_a, mx_b).asnumpy()
    out_shape = dshape

    new_sym, params = frontend.from_mxnet(mx_sym, args, auxs)
    shape_dict = {'a': dshape, 'b': dshape}
    for target, ctx in ctx_list():
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                new_sym, target, shape_dict, params=params)
        module = graph_runtime.create(graph, lib, ctx)
        # set inputs
        module.set_input("a", tvm.nd.array(np_a))
        module.set_input("b", tvm.nd.array(np_b))
        module.set_input(**params)
        module.run()
        # get outputs
        out_buf = tvm.nd.empty(out_shape, dtype)
        tvm_out = module.get_output(0, out_buf).asnumpy()
        tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
def test_conv_ewise_injective():
    """Check a grouped conv2d followed by ``+ 1``, flatten and ``+ 1``.

    The compiled graph is expected to contain five nodes, and its output is
    compared with a topi depthwise-convolution reference.
    """
    dtype = "float32"
    dshape = (1, 32, 18, 18)
    kshape = (32, 1, 3, 3)
    oshape = (1, 32* 18 * 18)
    shape_dict = {"x": dshape}

    x = sym.Variable("x")
    conv = sym.conv2d(x, channels=32, kernel_size=(3, 3), groups=32,
                      name="y", padding=(1,1))
    y = sym.flatten(conv + 1) + 1

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, shape_dict)
        module = graph_runtime.create(graph, lib, ctx)
        # print(graph.ir(join_entry_attrs=["shape"]))
        assert graph.index.num_nodes == 5

        # set input
        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype))
        module.run(x=data, y_weight=kernel, y_bias=bias)
        # get output
        result = module.get_output(0, tvm.nd.empty(oshape, dtype))

        # reference: depthwise conv, bias, +1, flatten, +1
        conv_np = topi.testing.depthwise_conv2d_python_nchw(
            data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
        biased_np = conv_np + bias.asnumpy().reshape(kshape[0], 1, 1) + 1
        flat_np = biased_np.reshape(biased_np.shape[0],
                                    np.prod(biased_np.shape[1:])) + 1
        np.testing.assert_allclose(result.asnumpy(), flat_np, rtol=1e-5)
def test_non_max_suppression():
    """Check non_max_suppression on a fixed five-row input.

    The operator is built with ``iou_threshold=0.7``, ``force_suppress=True``
    and ``top_k=2``; output 0 is compared with a hard-coded expected array.
    """
    dshape = (1, 5, 6)
    iou_threshold = 0.7
    force_suppress = True
    top_k = 2

    data = sym.Variable("data")
    valid_count = sym.Variable("valid_count", dtype="int32")
    out = sym.non_max_suppression(data=data, valid_count=valid_count,
                                  return_indices=False,
                                  iou_threshold=iou_threshold,
                                  force_suppress=force_suppress,
                                  top_k=top_k)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45],
                         [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40],
                         [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79],
                           [0, 0.8, 1, 20, 25, 45],
                           [-1, -1, -1, -1, -1, -1],
                           [-1, -1, -1, -1, -1, -1],
                           [-1, -1, -1, -1, -1, -1]]])

    for target, ctx in ctx_list():
        input_shapes = {"data": dshape, "valid_count": (dshape[0],)}
        input_dtypes = {"data": "float32", "valid_count": "int32"}
        graph, lib, _ = nnvm.compiler.build(out, target, input_shapes,
                                            dtype=input_dtypes)
        module = graph_runtime.create(graph, lib, ctx)
        module.set_input(**{"data": np_data, "valid_count": np_valid_count})
        module.run()
        tvm_out = module.get_output(
            0, tvm.nd.empty(np_result.shape, "float32"))
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result,
                                    atol=1e-5, rtol=1e-5)
def test_injective_conv2d():
    """Check ``reshape(global_avg_pool(data)) * data + conv2d(data)``.

    The compiled graph is expected to contain five nodes and its output is
    compared with a NumPy/topi reference.
    """
    channels = 16
    size = 56
    dtype="float32"
    dshape = (1, channels, size, size)
    kshape = (channels, channels, 3, 3)
    oshape = dshape
    shape_dict = {"data": dshape}

    data = sym.Variable(name="data")
    pool = sym.global_avg_pool2d(data=data)
    weight = sym.reshape(pool, shape=[1, channels, 1, 1])
    residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels,
                          padding=(1, 1), layout="NCHW", kernel_layout="OIHW",
                          use_bias=False, name="conv")
    net = weight * data + residual

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        # data, global_avg_pool, conv weight, conv op, fused elemwise add
        assert graph.index.num_nodes == 5

        data_nd = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel_nd = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        module = graph_runtime.create(graph, lib, ctx)
        module.run(data=data_nd, conv_weight=kernel_nd)
        # get output
        result = module.get_output(0, tvm.nd.empty(oshape, dtype))

        # reference computation
        residual_np = topi.testing.conv2d_nchw_python(
            data_nd.asnumpy(), kernel_nd.asnumpy(), (1,1), 'SAME')
        weight_np = np.mean(data_nd.asnumpy(), axis=(2, 3))
        c_np = (weight_np[:, :, np.newaxis, np.newaxis] * data_nd.asnumpy()
                + residual_np)
        tvm.testing.assert_allclose(result.asnumpy(), c_np, rtol=1e-5)
def check_verify():
    """Run the graph with ``mhost`` on ``ctx`` and check output 0.

    ``graph``, ``mhost``, ``ctx``, ``params``, ``shape`` and the four
    ``tensor_*`` arrays come from the enclosing scope.
    """
    runtime_mod = graph_runtime.create(graph, mhost, ctx)
    runtime_mod.set_input(**params)
    runtime_mod.run()
    result = runtime_mod.get_output(0, tvm.nd.empty(shape))
    expected = tensor_a + tensor_b - tensor_c + tensor_d
    np.testing.assert_equal(result.asnumpy(), expected)
def tune_and_evaluate(tuning_opt):
    """Extract conv2d tasks, tune them, then compile and time the network.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_kernels``.

    ``model_name``, ``batch_size``, ``target``, ``dtype`` and ``log_file``
    are read from the enclosing module.
    """
    # extract workloads from nnvm graph
    print("Extract tasks...")
    net, params, data_shape, _ = get_network(model_name, batch_size)
    input_shapes = {'data': data_shape}
    tasks = autotvm.task.extract_from_graph(net, target=target,
                                            shape=input_shapes, dtype=dtype,
                                            symbols=(nnvm.sym.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                net, target=target, shape={'data': data_shape},
                params=params, dtype=dtype)

        # upload parameters to device
        ctx = tvm.cpu()
        random_input = np.random.uniform(size=data_shape).astype(dtype)
        data_tvm = tvm.nd.array(random_input)
        module = runtime.create(graph, lib, ctx)
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
        # convert to millisecond
        prof_res = np.array(ftimer().results) * 1000
        mean_ms = np.mean(prof_res)
        std_ms = np.std(prof_res)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (mean_ms, std_ms))
def run(args):
    """Compile an ONNX test case with NNVM/TVM, verify it, optionally time it.

    Loads ``model.onnx`` and ``test_data_set_0`` from ``args.test_dir``,
    compiles the model, runs it on ``tvm.gpu()`` and compares every output
    with the stored expectation.  When ``args.iterations > 1`` the module is
    re-run ``args.iterations - 1`` times and the mean time per run is printed.

    Parameters
    ----------
    args : namespace
        Reads ``test_dir``, ``target``, ``opt_level``, ``autotvm_log``,
        ``dump_nnvm``, ``debug`` and ``iterations``.

    Raises
    ------
    AssertionError
        If an output differs from its expected value beyond
        ``rtol=1e-3`` / ``atol=1e-4``; the message carries the output name.
    """
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()

    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)

    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)

    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)

    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.  Parameters are inserted last, as in the two original
    # loops, so a parameter wins over a test input with the same name.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            # Best effort: fall back to the plain runtime below.  Only
            # ``Exception`` is caught so Ctrl-C / SystemExit still propagate.
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)

    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        # Pass the output name as the failure message so a mismatch can be
        # attributed to a specific output.
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        # The verification run above counts as the first iteration.
        num_iterations = args.iterations - 1
        start = time.time()
        for _ in range(num_iterations):
            graph_module.run()
            cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))
def test_nms():
    """Check ``sym.nms`` on a fixed five-row input using the llvm target.

    The operator is built with ``nms_threshold=0.7``, ``force_suppress=True``
    and ``nms_topk=2``; output 0 is compared with a hard-coded expected array.
    """
    dshape = (1, 5, 6)
    nms_threshold = 0.7
    force_suppress = True
    nms_topk = 2

    data = sym.Variable("data")
    valid_count = sym.Variable("valid_count", dtype="int32")
    nms_sym = sym.nms(data=data, valid_count=valid_count,
                      nms_threshold=nms_threshold,
                      force_suppress=force_suppress, nms_topk=nms_topk)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45],
                         [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40],
                         [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79],
                           [0, 0.8, 1, 20, 25, 45],
                           [0, 0.4, 4, 21, 19, 40],
                           [-1, 0.9, 35, 61, 52, 79],
                           [-1, -1, -1, -1, -1, -1]]])

    target = "llvm"
    ctx = tvm.cpu()
    input_shapes = {"data": dshape, "valid_count": (dshape[0],)}
    input_dtypes = {"data": "float32", "valid_count": "int32"}
    graph, lib, _ = nnvm.compiler.build(nms_sym, target, input_shapes,
                                        dtype=input_dtypes)
    module = graph_runtime.create(graph, lib, ctx)
    module.set_input(**{"data": np_data, "valid_count": np_valid_count})
    module.run()
    result = module.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
    tvm.testing.assert_allclose(result.asnumpy(), np_result,
                                atol=1e-5, rtol=1e-5)
def test_gru_like():
    """Check a dense -> split -> sigmoid/tanh/exp unit against NumPy.

    The Relay function is built at ``opt_level=2`` for every target and its
    output is compared with an equivalent NumPy computation.
    """
    def unit(rnn_dim):
        # Relay version: dense, split in three, combine the parts.
        X = relay.var("X", shape=(1, rnn_dim))
        W = relay.var("y", shape=(3 * rnn_dim, rnn_dim))
        matmul = relay.nn.dense(X, W)
        parts = relay.split(matmul, indices_or_sections=3, axis=1)
        gate = relay.sigmoid(parts[0])
        candidate = relay.tanh(parts[1]) * relay.exp(parts[2])
        return relay.Function([X, W], gate + candidate)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def unit_numpy(X, W):
        # NumPy reference of ``unit``.
        prod = np.dot(X, W.transpose())
        parts = np.split(prod, indices_or_sections=3, axis=1)
        return sigmoid(parts[0]) + np.tanh(parts[1]) * np.exp(parts[2])

    dtype = "float32"
    rnn_dim = 1000
    out_shape = (1, rnn_dim)
    x = np.random.rand(1, rnn_dim).astype(dtype)
    y = np.random.rand(3*rnn_dim, rnn_dim).astype(dtype) * 0.01 - 0.005
    z = unit(rnn_dim)

    for target, ctx in ctx_list():
        with relay.build_config(opt_level=2):
            graph, lib, params = relay.build(z, target)
        module = graph_runtime.create(graph, lib, ctx)
        module.set_input("X", tvm.nd.array(x.astype(dtype)))
        module.set_input("y", tvm.nd.array(y.astype(dtype)))
        module.set_input(**params)
        module.run()
        result = module.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
        ref = unit_numpy(x, y)
        tvm.testing.assert_allclose(result, ref, rtol=1e-5, atol=1e-5)
def test_mixed_precision():
    """Check conv2d with int8 inputs and an int32 output dtype.

    The reference casts data and kernel to the output dtype before calling
    the topi NCHW convolution.
    """
    dtype = "int8"
    out_dtype="int32"
    dshape = (1, 3, 18, 18)
    kshape = (10, 3, 3, 3)
    oshape = (1, 10, 18, 18)
    shape_dict = {"x": dshape}
    dtype_dict = {"x": dtype}

    x = sym.Variable("x")
    y = sym.conv2d(x, channels=10, kernel_size=(3,3),
                   name="y", padding=(1,1), use_bias=False,
                   out_dtype="int32")

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, shape_dict, dtype_dict)
        module = graph_runtime.create(graph, lib, ctx)
        data = tvm.nd.array(
            np.random.uniform(-127, 127, size=dshape).astype(dtype))
        kernel = tvm.nd.array(
            np.random.uniform(-127, 127, size=kshape).astype(dtype))
        module.run(x=data, y_weight=kernel)
        result = module.get_output(0, tvm.nd.empty(oshape, out_dtype))

        wide_data = data.asnumpy().astype(out_dtype)
        wide_kernel = kernel.asnumpy().astype(out_dtype)
        c_np = topi.testing.conv2d_nchw_python(wide_data, wide_kernel, 1, 1)
        tvm.testing.assert_allclose(result.asnumpy(), c_np, rtol=1e-5)
def test_forward_where():
    """Compare MXNet's ``where`` with the converted NNVM graph.

    The condition is a fixed 2x2 array; ``x`` and ``y`` are random.  The
    symbol is converted with ``frontend.from_mxnet`` and built at
    ``opt_level=3`` for every target.
    """
    dshape = (2, 2)
    dtype = 'float32'

    cond = mx.sym.var('cond')
    x = mx.sym.var('x')
    y = mx.sym.var('y')
    mx_sym = mx.sym.where(cond, x, y)

    np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype)
    np_x = np.random.uniform(size=dshape).astype(dtype)
    np_y = np.random.uniform(size=dshape).astype(dtype)
    mx_cond = mx.nd.array(np_cond)
    mx_x = mx.nd.array(np_x)
    mx_y = mx.nd.array(np_y)

    # A bound module is only used to obtain the (arg, aux) parameter dicts.
    mod = mx.mod.Module(mx_sym, label_names=None,
                        data_names=['cond', 'x', 'y'])
    mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)],
             for_training=False)
    mod.init_params()
    args, auxs = mod.get_params()

    mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy()
    out_shape = dshape

    new_sym, params = frontend.from_mxnet(mx_sym, args, auxs)
    shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape}
    for target, ctx in ctx_list():
        with nnvm.compiler.build_config(opt_level=3):
            graph, lib, params = nnvm.compiler.build(
                new_sym, target, shape_dict, params=params)
        module = graph_runtime.create(graph, lib, ctx)
        # set inputs
        module.set_input("cond", tvm.nd.array(np_cond))
        module.set_input("x", tvm.nd.array(np_x))
        module.set_input("y", tvm.nd.array(np_y))
        module.set_input(**params)
        module.run()
        # get outputs
        out_buf = tvm.nd.empty(out_shape, dtype)
        tvm_out = module.get_output(0, out_buf).asnumpy()
        tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
def main():
    """Benchmark resnet or mobilenet on a remote device over RPC.

    The model is built with ``tvm.target.rasp()``, the object file is
    uploaded to the RPC server given by ``--host``/``--port``, one warm-up
    inference is run, and the ``run`` function is then timed three times
    with a 45 second pause after each measurement.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host', type=str, required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level
    num_iter = args.num_iter

    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)

    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.rasp(), shape={"data": data_shape}, params=params)

    # Save the compiled library locally, then ship it to the remote device.
    workdir = util.tempdir()
    lib_fname = workdir.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {name: tvm.nd.array(value, ctx)
               for name, value in params.items()}

    module = runtime.create(graph, rlib, ctx)
    random_input = np.random.uniform(size=(data_shape)).astype("float32")
    module.set_input('data', tvm.nd.array(random_input))
    module.set_input(**rparams)
    module.run()
    # Fetch the output once so the warm-up inference has completed.
    warmup_out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    warmup_out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for _ in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat
        time.sleep(45)
def graph_to_function(graph, target, ctx, shape=None, dtype=None):
    """Convert a graph to a function taking a keyword args and returning a list of results
    (both args and results are numpy arrays).

    Example::

        fun = graph_to_function(graph, llvm, cpu(0))
        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))

    Parameters
    ----------
    graph : nnvm.graph.Graph
        A graph we want to convert to a function.

    target : str or :any:`tvm.target.Target`
        The build target

    ctx : TVMContext
        The context to deploy the module.

    shape : Dict[str, Tuple[int]], optional
        A dict mapping input variable names to shapes.
        By default shapes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    Returns
    -------
    function : Callable[..., List[numpy.ndarray]]

    Raises
    ------
    ValueError
        If, after inference, some input variable still has no dtype or no
        shape.
    """
    # Infer missing shapes and dtypes
    graph, shape, dtype, output_shapes, output_dtypes = \
        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)

    if None in dtype.values():
        raise ValueError("Input variables with no type: {}".format(dtype))

    if not all(shape.values()):
        raise ValueError("Input variables with no shape: {}".format(shape))

    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
    module = graph_runtime.create(compute_graph, lib, ctx)

    if params:
        # The graph runtime module exposes ``set_input`` (singular); it takes
        # the parameters as keyword arguments.
        module.set_input(**params)

    def run(**kwargs):
        # Feed the keyword arguments as inputs, execute, and copy every
        # output back into a fresh numpy array.
        module.run(**kwargs)
        res = []
        for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
            res.append(module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
        return res

    return run
def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
    """Build ``sym`` at ``opt_level``, run it on ``data``, return the result.

    Returns
    -------
    (numpy.ndarray, graph)
        Output 0 as a numpy array, and the graph returned by the build.
    """
    input_shapes = {"data":data.shape}
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            sym, target, shape=input_shapes, params=params)
    executor = graph_runtime.create(graph, lib, ctx)
    executor.set_input(**params)
    executor.set_input("data", data)
    executor.run()
    result = executor.get_output(0, tvm.nd.empty(out_shape))
    return result.asnumpy(), graph
def build_and_run(sym, params, data, out_shape):
    """Build ``sym`` for llvm, run it on ``tvm.cpu(0)`` and return output 0.

    The result is returned as a numpy array of shape ``out_shape``.
    """
    input_shapes = {"data":data.shape}
    graph, lib, params = nnvm.compiler.build(
        sym, "llvm", shape=input_shapes, params=params)
    ctx = tvm.cpu(0)
    executor = runtime.create(graph, lib, ctx)
    executor.set_input(**params)
    executor.set_input("data", data)
    executor.run()
    result = executor.get_output(0, tvm.nd.empty(out_shape))
    return result.asnumpy()
def main():
    """Benchmark resnet or mobilenet on a local GPU-style target.

    The model is built for ``--target``, one warm-up inference is run, and
    the ``run`` function is then timed ``--repeat`` times with a 45 second
    pause between measurements.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--target', type=str, required=True,
                        choices=['cuda', 'rocm', 'opencl', 'metal'],
                        help="Compilation target.")
    parser.add_argument('--opt-level', type=int, default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=1000,
                        help="Number of iteration during benchmark.")
    parser.add_argument('--repeat', type=int, default=1,
                        help="Number of repeative times.")
    args = parser.parse_args()

    opt_level = args.opt_level
    num_iter = args.num_iter
    ctx = tvm.context(args.target, 0)

    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)

    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    # cuda uses a larger auto-unroll budget and implicit unrolling.
    is_cuda = args.target == "cuda"
    unroll = 1400 if is_cuda else 128
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.build_config(auto_unroll_max_step=unroll,
                              unroll_explicit=(args.target != "cuda")):
            graph, lib, params = nnvm.compiler.build(
                net, args.target, shape={"data": data_shape}, params=params)

    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
    module = runtime.create(graph, lib, ctx)
    module.set_input(**params)
    module.set_input("data", data)
    module.run()
    # Fetch the output once so the warm-up inference has completed.
    warmup_out = module.get_output(0, tvm.nd.empty(out_shape))
    warmup_out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    last_round = args.repeat - 1
    for i in range(args.repeat):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding device overheat
        if i != last_round:
            time.sleep(45)
def check_verify():
    """Build ``myadd`` for llvm and check that the graph computes ``a + 1``.

    Skips with a message when the llvm module is not enabled.  ``s``, ``A``,
    ``B``, ``graph`` and ``n`` come from the enclosing scope.
    """
    llvm_enabled = tvm.module.enabled("llvm")
    if not llvm_enabled:
        print("Skip because llvm is not enabled")
        return

    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    runtime_mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
    a_np = np.random.uniform(size=(n,)).astype(A.dtype)
    runtime_mod.run(x=a_np)
    result = runtime_mod.get_output(0, tvm.nd.empty((n,)))
    np.testing.assert_equal(result.asnumpy(), a_np + 1)
def verify(graph, lib):
    """Run ``graph`` on CPU with random ``x``/``y`` and expect ``exp(x + y)``.

    ``shape`` and ``dtype`` come from the enclosing scope.
    """
    module = graph_runtime.create(graph, lib, tvm.cpu(0))
    # get member functions
    x_nd = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    y_nd = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    module.run(x=x_nd, y=y_nd)
    # get outputs
    result = module.get_output(0, tvm.nd.empty(shape, dtype))
    expected = np.exp(x_nd.asnumpy() + y_nd.asnumpy())
    tvm.testing.assert_allclose(result.asnumpy(), expected)
def get_tvm_output(xs, target, ctx, dtype='float32'):
    """Convert ``keras_model`` to Relay, run it on ``xs``, return all outputs.

    ``xs`` is paired positionally with ``keras_model.input_names``; the
    model itself comes from the enclosing scope.  Returns a list with one
    numpy array per runtime output.
    """
    input_names = keras_model.input_names
    shape_dict = {}
    for name, x in zip(input_names, xs):
        shape_dict[name] = x.shape

    func, params = relay.frontend.from_keras(keras_model, shape_dict)
    with relay.build_module.build_config(opt_level=2):
        graph, lib, params = relay.build(func, target, params=params)

    module = graph_runtime.create(graph, lib, ctx)
    for name, x in zip(keras_model.input_names, xs):
        module.set_input(name, tvm.nd.array(x.astype(dtype)))
    module.set_input(**params)
    module.run()

    outputs = []
    for index in range(module.get_num_outputs()):
        outputs.append(module.get_output(index).asnumpy())
    return outputs
def test_num_outputs():
    """A five-way split must be reported as five runtime outputs."""
    shape = (10, 10)
    dtype = tvm.float32
    x = sym.Variable('x')
    z = sym.split(x, indices_or_sections=5, axis=1)

    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
    params = {"x": nx}
    graph, lib, params = nnvm.compiler.build(
        z, "llvm", shape={"x": nx.shape}, params=params)
    module = graph_runtime.create(graph, lib, tvm.cpu(0))
    num_outputs = module.get_num_outputs()
    assert num_outputs == 5
def get_tvm_output(func, x, params, target, ctx,
                   out_shape=(1, 1000), input_name='image', dtype='float32'):
    """Build a Relay function at ``opt_level=3`` and return output 0.

    ``x`` is cast to ``dtype`` and bound to ``input_name``; the result is a
    numpy array of shape ``out_shape``.
    """
    with relay.build_module.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)

    module = graph_runtime.create(graph, lib, ctx)
    # set inputs
    input_nd = tvm.nd.array(x.astype(dtype))
    module.set_input(input_name, input_nd)
    module.set_input(**params)
    module.run()
    # get outputs
    out_buf = tvm.nd.empty(out_shape, dtype)
    return module.get_output(0, out_buf).asnumpy()
def run_test_conv2d(sym, dtype, dshape, kshape, oshape, shape_dict, padding):
    """Run a biased conv2d symbol on every target and compare with topi.

    Random data, kernel and bias are bound to ``x``, ``y_weight`` and
    ``y_bias``; the reference uses stride 1 and the given ``padding``.
    """
    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(sym, target, shape_dict)
        module = graph_runtime.create(graph, lib, ctx)

        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype))
        module.run(x=data, y_weight=kernel, y_bias=bias)
        result = module.get_output(0, tvm.nd.empty(oshape, dtype))

        conv_np = topi.testing.conv2d_nchw_python(
            data.asnumpy(), kernel.asnumpy(), 1, padding)
        # One bias value per output channel.
        expected = conv_np + bias.asnumpy().reshape(kshape[0], 1, 1)
        tvm.testing.assert_allclose(result.asnumpy(), expected, rtol=1e-5)
def verify_reduce(dshape, fnp, fsym, **kwargs):
    """Compare the reduction ``fsym(x + 1)`` with ``fnp(data + 1)``.

    ``kwargs`` are forwarded unchanged to both the symbolic and the NumPy
    reduction.
    """
    dtype = "float32"
    x = sym.Variable("x")
    y = fsym(x + 1, **kwargs)
    input_shapes = {"x": dshape}

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, input_shapes)
        module = graph_runtime.create(graph, lib, ctx)
        # set input
        data = np.random.uniform(size=dshape).astype(dtype)
        expected = fnp(data + 1, **kwargs)
        module.run(x=data)
        result = module.get_output(0, tvm.nd.empty(expected.shape))
        np.testing.assert_allclose(result.asnumpy(), expected,
                                   atol=1e-5, rtol=1e-5)
def get_tvm_output(symbol, x, params, target, ctx,
                   out_shape=(1, 1000), input_name='image', dtype='float32'):
    """Build an NNVM symbol at ``opt_level=2`` and return output 0.

    ``x`` is cast to ``dtype`` and bound to ``input_name``; the result is a
    numpy array of shape ``out_shape``.
    """
    shape_dict = {input_name : x.shape}
    with nnvm.compiler.build_config(opt_level=2):
        graph, lib, params = nnvm.compiler.build(
            symbol, target, shape_dict, params=params)

    module = graph_runtime.create(graph, lib, ctx)
    # set inputs
    input_nd = tvm.nd.array(x.astype(dtype))
    module.set_input(input_name, input_nd)
    module.set_input(**params)
    module.run()
    # get outputs
    out_buf = tvm.nd.empty(out_shape, dtype)
    return module.get_output(0, out_buf).asnumpy()
def check_load_module():
    """Round-trip the library and graph through disk, then verify the result.

    ``mhost`` is exported to ``deploy.so`` and ``graph`` is written to
    ``deploy.json`` in a temporary directory; both are loaded back and run,
    and output 0 is compared with
    ``tensor_a + tensor_b - tensor_c + tensor_d``.  All of these names, plus
    ``ctx``, ``params`` and ``shape``, come from the enclosing scope.
    """
    temp = util.tempdir()
    path_lib = temp.relpath("deploy.so")
    mhost.export_library(path_lib)
    with open(temp.relpath("deploy.json"), "w") as out_file:
        out_file.write(graph)
    loaded_lib = tvm.module.load(path_lib)
    # Read through a context manager so the handle is closed deterministically.
    with open(temp.relpath("deploy.json")) as in_file:
        loaded_graph = in_file.read()
    mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
    mod.set_input(**params)
    mod.run()
    out = mod.get_output(0, tvm.nd.empty(shape))
    np.testing.assert_equal(
        out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
def tune_and_evaluate(tuning_opt):
    """Extract conv2d tuning tasks for VTA, then (optionally) tune and time.

    The function obtains a remote session, extracts the conv2d tasks from
    the Relay program, prints their parameters and then returns.  Everything
    after the bare ``return`` (tuning, compilation, upload and timing) is
    unreachable until that line is commented out.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_tasks``.

    ``env``, ``target``, ``network``, ``start_pack``, ``stop_pack``,
    ``tracker_host``, ``tracker_port``, ``log_file`` and ``device`` are read
    from the enclosing module.
    """

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(relay.op.get("nn.conv2d"),),
                                              target=target,
                                              target_host=env.target_host)
    # filter out non-packed conv2d task
    # (keeps only tasks whose first argument's shape has more than 4 dims)
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        # args[0][1] / args[1][1] are the shapes of the first two arguments.
        # NOTE(review): presumably a packed 6-d layout where dims 4 and 5
        # are inner tiling factors of dims 0 and 1 - confirm against VTA.
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter, hkernel, wkernel,
            hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def test_full():
    """Check the constant-fill operators on every context from ``ctx_list()``.

    Covers ``full_like``, ``ones_like``, ``zeros_like`` (which take a ``data``
    input) and ``full``, ``ones``, ``zeros`` (which take none). Each result is
    compared against ``np.full`` with the expected fill value.
    """
    shape = (3, 4, 5)
    value = 7
    dtype = "float32"

    def _build_run_compare(s, target, ctx, expected_fill, feeds_data):
        """Compile ``s``, run it once and compare output 0 with a filled array."""
        if feeds_data:
            graph, lib, _ = nnvm.compiler.build(s, target, {"data": shape})
        else:
            graph, lib, _ = nnvm.compiler.build(s, target)
        m = graph_runtime.create(graph, lib, ctx)
        if feeds_data:
            m.run(data=np.random.uniform(size=shape).astype(dtype))
        else:
            m.run()
        out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
        np.testing.assert_allclose(
            out.asnumpy(),
            np.full(shape, fill_value=expected_fill, dtype=dtype),
            atol=1e-5, rtol=1e-5)

    for target, ctx in ctx_list():
        data = sym.Variable("data", dtype=dtype)

        # full_like
        _build_run_compare(
            sym.full_like(data=data, fill_value=value, name="s"),
            target, ctx, value, True)
        # ones_like
        _build_run_compare(
            sym.ones_like(data=data, fill_value=value, name="s"),
            target, ctx, 1, True)
        # zeros_like
        _build_run_compare(
            sym.zeros_like(data=data, fill_value=value, name="s"),
            target, ctx, 0, True)
        # full
        _build_run_compare(
            sym.full(shape=shape, dtype=dtype, fill_value=value, name="s"),
            target, ctx, value, False)
        # ones
        _build_run_compare(
            sym.ones(shape=shape, dtype=dtype, name="s"),
            target, ctx, 1, False)
        # zeros
        _build_run_compare(
            sym.zeros(shape=shape, dtype=dtype, name="s"),
            target, ctx, 0, False)
# Compile `mod`, run it through the graph runtime on the CPU and print the
# mean / standard deviation of the measured run time.
# NOTE(review): `cpudevice` is not read again in this fragment - confirm it is
# used elsewhere before removing it.
cpudevice = tvm.runtime.cpu()
ctx = tvm.runtime.context("cpu")
with tvm.transform.PassContext(opt_level=3):
    graph_mod = relay.build(mod, tvm_targets, params=params, target_host=target_host)
    # Unpack the three build artifacts; `params` is rebound to the built parameters.
    lib = graph_mod.get_lib()
    params = graph_mod.get_params()
    graph = graph_mod.get_json()
# Create a runtime executor module
module = graph_runtime.create(graph, lib, tvm.cpu())
# Feed input data
module.set_input(input_tensor, tvm.nd.array(image_data))
# Feed related params
module.set_input(**params)
# Time the "run" function: 10 repeats of a single run each.
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(
    ftimer().results) * 1000  # multiply 1000 for converting to millisecond
print("%-20s %-7s %-19s (%s)" % (model_name, device, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
# NOTE(review): the build above uses `tvm_targets` while this prints
# `tvm_target`; both are defined outside this fragment - confirm this is intended.
print(tvm_target)
def main():
    """Compile a pretrained Gluon resnet18_v1 with Relay and classify one image over RPC.

    Downloads a test image and the ImageNet label table, converts the Gluon
    model to Relay, appends a softmax, builds for the Raspberry Pi 3B target
    (or plain LLVM when ``local_demo`` is set), uploads the library through
    RPC, runs it and prints the top-1 label.
    """
    # Local import: only this function needs it (used to parse the label table).
    import ast

    # one line to get the model
    block = get_model('resnet18_v1', pretrained=True)

    # test model
    img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
    img_name = 'cat.png'
    img_path = download_testdata(img_url, img_name, module='data')
    image = Image.open(img_path).resize((224, 224))
    x = transform_image(image)

    # label number to word dict prepped with synset
    synset_url = ''.join([
        'https://gist.githubusercontent.com/zhreshold/',
        '4d0b62f3d01426887599d4f7ede23ee5/raw/',
        '596b27d23537e5a1b5751d2b0481ef172f58b539/',
        'imagenet1000_clsid_to_human.txt'
    ])
    synset_name = 'imagenet1000_clsid_to_human.txt'
    synset_path = download_testdata(synset_url, synset_name, module='data')
    with open(synset_path) as f:
        # SECURITY: this file comes from the network. It was previously passed
        # to eval(), which would execute arbitrary code in it; literal_eval
        # only accepts Python literals such as the expected dict.
        synset = ast.literal_eval(f.read())

    # Port Gluon model to portable computational graph.
    # The input shape is taken from the preprocessed image itself.
    shape_dict = {'data': x.shape}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    # we want a probability so add a softmax operator
    func = mod["main"]
    func = relay.Function(func.params, relay.nn.softmax(func.body), None,
                          func.type_params, func.attrs)

    # compile the graph to run on RaspPi modelB
    local_demo = False
    if local_demo:
        target = tvm.target.create('llvm')
    else:
        target = tvm.target.arm_cpu('rasp3b')

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target, params=params)

    # Save the library at local temporary directory.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # RPC server is running on the Rasp Pi.
    # Get the IP address of the Rasp Pi and connect to the machine to run the
    # net compiled here with Relay.

    # obtain an RPC session from remote device.
    if local_demo:
        remote = rpc.LocalSession()
    else:
        # The following is my environment, change this to the IP address of your target device
        host = '192.168.0.10'
        port = 9090
        remote = rpc.connect(host, port)

    # upload the library to remote device and load it
    remote.upload(lib_fname)
    rlib = remote.load_module('net.tar')

    # create the remote runtime module
    ctx = remote.cpu(0)
    module = runtime.create(graph, rlib, ctx)
    # set parameter (upload params to the remote device. This may take a while)
    module.set_input(**params)
    # set input data
    module.set_input('data', tvm.nd.array(x.astype('float32')))
    # run
    module.run()
    # get output
    out = module.get_output(0)
    # get top1 result
    top1 = np.argmax(out.asnumpy())
    print('TVM prediction top-1: {}'.format(synset[top1]))
def deploy_rpc():
    """Runs the demo that deploys a model remotely through RPC.

    Compiles resnet18 for an OpenGL device with an Emscripten host library,
    uploads the generated files to an RPC server on ``localhost:9090``, runs
    one inference on random input there and prints the first ten output values.
    """
    from tvm import rpc
    from tvm.contrib import util, emscripten, graph_runtime

    # As usual, load the resnet18 model.
    net, params, data_shape, out_shape = load_mxnet_resnet()

    # Compile the model.
    # Note that this time we are changing the target.
    # This is because we want to translate the host library into JavaScript
    # through Emscripten.
    graph, lib, params = compile_net(
        net,
        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
        target="opengl",
        data_shape=data_shape,
        params=params)

    # Now we want to deploy our model through RPC.
    # First we need to prepare the module files locally.
    print("Saving the compiled module...")
    workdir = util.tempdir()
    path_obj = workdir.relpath("deploy.bc")  # host LLVM part
    path_dso = workdir.relpath("deploy.js")  # host JavaScript part
    path_gl = workdir.relpath("deploy.gl")  # device GLSL part
    path_json = workdir.relpath("deploy.tvm_meta.json")

    lib.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    device_module = lib.imported_modules[0]
    device_module.save(path_gl)
    print("- Saved files:", workdir.listdir())

    # Connect to the RPC server.
    print("Connecting to RPC server...")
    remote = rpc.connect('localhost', 9090, key="js")
    print("- Connected to RPC server!")

    # Upload module to RPC server.
    print("Uploading module to RPC server...")
    remote.upload(path_dso, "deploy.dso")
    for local_path in (path_gl, path_json):
        remote.upload(local_path)
    print("- Upload completed!")

    # Load remote library: the device part is imported into the host part.
    print("Loading remote library...")
    fdev = remote.load_module("deploy.gl")
    rlib = remote.load_module("deploy.dso")
    rlib.import_module(fdev)
    print("- Remote library loaded!")

    ctx = remote.opengl(0)

    # Upload the parameters.
    print("Uploading parameters...")
    rparams = {name: tvm.nd.array(value, ctx) for name, value in params.items()}
    print("- Parameters uploaded!")

    # Create the remote runtime module.
    print("Running remote module...")
    module = graph_runtime.create(graph, rlib, ctx)

    # Set parameter.
    module.set_input(**rparams)

    # Set input data.
    random_input = np.random.uniform(size=data_shape).astype('float32')
    module.set_input('data', tvm.nd.array(random_input))

    # Run.
    module.run()
    print("- Remote module execution completed!")

    result = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
    # Print first 10 elements of output.
    print(result.asnumpy()[0][0:10])
# Build `output` for CUDA with random parameters and time the compiled graph.
trials = 50  # passed as `number=` to the time evaluator below
compute_graph = nnvm.graph.create(output)
ctx = tvm.device("cuda", 0)
# Random parameters created on `ctx`; with_input=True so that the dict also
# holds the "data" entry read on the next line.
params = generate_random_parameters(compute_graph, "data", data_shape, with_input=True, context=ctx)
input_data = params["data"]
deploy_graph, lib, params = nnvm.compiler.build(compute_graph, target="cuda", shape={"data": data_shape}, params=params)
# print(deploy_graph.ir())
module = graph_runtime.create(deploy_graph, lib, ctx)
# warm-up
module.run(data=input_data)
# NOTE: `output` is rebound here from the symbol used above to the runtime result.
output = module.get_output(0, None)
# print(output.asnumpy())
time_evaluator = module.module.time_evaluator("run", ctx, number=trials, repeat=10)
# Scale the mean by 1e3; the value is printed with an "ms" label below.
time_cost = time_evaluator().mean * 1e3
print("time_cost=", time_cost, "ms")
###################################################################### # Build the ResNet Runtime # ------------------------ # Build the ResNet graph runtime, and configure the parameters. # Set ``device=vtacpu`` to run inference on the CPU # or ``device=vta`` to run inference on the FPGA. device = "vta" # Device context ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) # Build the graph runtime graph, lib, params = generate_graph(os.path.join(data_dir, graph_fn), os.path.join(data_dir, params_fn), device) m = graph_runtime.create(graph, lib, ctx) # Set the parameters m.set_input(**params) ###################################################################### # Run ResNet-18 inference on a sample image # ----------------------------------------- # Perform image classification on test image. # You can change the test image URL to any image of your choosing. # Read in test image image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' # Read in test image response = requests.get(image_url) image = Image.open(BytesIO(response.content)).resize((224, 224))
def test_one_time(one_time_length=1000, Test_sparse=True, image_shape=(3, 32, 32)):
    """Build one convolution layer, run it repeatedly over RPC and time the loop.

    Parameters
    ----------
    one_time_length : int
        Number of consecutive ``module.run()`` calls that are timed.
    Test_sparse : bool
        When true the layer is ``sym.conv2d_sparse`` (12 channels, driven by the
        ``sparse_kernel`` input); otherwise a regular ``sym.conv2d`` (10 channels).
    image_shape : tuple
        Shape of one input sample; a batch dimension of 1 is prepended.

    Returns
    -------
    float
        Wall-clock seconds (``time.time()`` difference) spent in the run loop.
    """
    # Hyper-parameter define
    batch_size = 1
    num_class = 10
    data_shape = (batch_size, ) + image_shape
    # NOTE(review): `out_shape` (and hence `num_class`) is not used below.
    out_shape = (batch_size, num_class)
    sparse_kernel_shape = (batch_size, 12)
    dtype = "float32"

    data = sym.Variable("data")
    sparse_kernel = sym.Variable("sparse_kernel", init=np.random.randint(
        0, 2, sparse_kernel_shape).astype(dtype))
    if Test_sparse:
        y1 = sym.conv2d_sparse(data=data, sparsity=sparse_kernel, channels=12, kernel_size=(3, 3), padding=(0, 0), use_bias=False, out_layout='NCHW')
    else:
        y1 = sym.conv2d(data=data, channels=10, kernel_size=(3, 3), padding=(0, 0), use_bias=False, out_layout='NCHW')
    # y = sym.flatten(y1)
    # y = sym.dense(y, units=10, use_bias=False)
    # y = sym.softmax(y)
    out = y1

    # Test Graph compilation
    # Once the API is well-defined, this part will be OK
    # g = graph.create(out)
    # print("-------------Starts----------------")
    # print(g.json())
    # print("-----------------------------------")
    # print(g.ir())
    # print("--------------Ends-----------------")

    # Create workload
    net, params = create_sparse_workload(out, batch_size, image_shape, dtype)
    # print("-------------Starts2---------------")
    # print(net.debug_str())
    # print(params)
    # print("--------------Ends2----------------")

    # Test Forward
    # NNVM-compiler build
    opt_level = 0
    target = tvm.target.mali()
    target_host = "llvm -target=aarch64-linux-gnu"
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net, target=target, shape={"data": data_shape}, params=params, target_host=target_host)

    # Export the compiled library and load it on the remote device.
    tmp = util.tempdir()
    lib_fname = tmp.relpath("net.tar")
    lib.export_library(lib_fname)
    # NOTE(review): the RPC server address is hard-coded.
    remote = rpc.connect('59.78.6.204', 9090)
    remote.upload(lib_fname)
    rlib = remote.load_module("net.tar")
    ctx = remote.cl(0)

    # create random input
    real_data = np.random.uniform(-1, 1, size=data_shape).astype(dtype)
    # Fixed alternating 0/1 pattern with 12 entries.
    real_sparse_kernel = np.array(([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])).astype(dtype)
    # real_sparse_kernel = np.random.randint(0, 2, sparse_kernel_shape).astype(dtype)
    # print(real_data)
    # print(real_sparse_kernel)

    # create module
    module = graph_runtime.create(graph, rlib, ctx)
    # set input and parameters
    module.set_input("data", real_data)
    if Test_sparse:
        module.set_input("sparse_kernel", real_sparse_kernel)
    module.set_input(**params)

    # run
    # localtime = time.asctime(time.localtime(time.time()))
    # print("Start time:" + localtime)
    starttime = time.time()
    for _ in range(one_time_length):
        module.run()
    endtime = time.time()
    # localtime = time.asctime(time.localtime(time.time()))
    # print("End time:" + localtime)
    print(endtime - starttime)

    # get output
    out = module.get_output(0)
    # convert to numpy (the converted array itself is discarded)
    out.asnumpy()

    # Print first 10 elements of output
    # print("-------------Starts3---------------")
    # # print(out.asnumpy().flatten()[0:10])
    # print(out)
    # print("--------------Ends3----------------")
    return endtime - starttime
def main():
    """Run PoseNet over every .png/.jpg in ``args.image_dir`` and report throughput.

    Inference runs either through the PyTorch model or, when ``args.use_tvm``
    is set, through a precompiled TVM graph runtime module. Each image is
    decoded with the ``multi`` or ``single`` decoder. Depending on the
    arguments, the annotated image and the keypoints are written to
    ``args.output_dir`` and the poses are printed. Average pre-processing,
    inference and post-processing FPS are printed at the end.

    Raises
    ------
    NotImplementedError
        If ``args.decoder`` is neither ``"multi"`` nor ``"single"``.
    """
    model = posenet.load_model(args.model)
    model = model.to(DEVICE).eval()
    output_stride = model.output_stride

    if args.output_dir:
        # exist_ok avoids the race between checking for the directory and
        # creating it.
        os.makedirs(args.output_dir, exist_ok=True)

    filenames = [
        f.path
        for f in os.scandir(args.image_dir)
        if f.is_file() and f.path.endswith((".png", ".jpg"))
    ]

    if args.use_tvm:
        import tvm
        from tvm.contrib import graph_runtime

        with open(args.tvm_graph) as f:
            tvm_graph = f.read()
        tvm_lib = tvm.runtime.load_module(args.tvm_lib)
        with open(args.tvm_params, "rb") as f:
            tvm_params = bytearray(f.read())
        ctx = tvm.cpu()
        module = graph_runtime.create(tvm_graph, tvm_lib, ctx)
        module.load_params(tvm_params)

    # Per-image wall-clock durations for the three pipeline stages.
    preprocessing_time = []
    inference_time = []
    processing_time = []

    for filename in tqdm(filenames, desc="Processed", unit="files"):
        start = now()
        input_image, draw_image, output_scale = posenet.read_imgfile(
            filename,
            scale_factor=args.scale_factor,
            output_stride=output_stride,
            resize=(args.processing_height, args.processing_width)
            if args.resize
            else None,
        )
        preprocessing_time.append(now() - start)

        start = now()
        with torch.no_grad():
            if args.use_tvm:
                input_data = tvm.nd.array(input_image)
                module.run(**{args.input_name: input_data})
                # Move every runtime output back into a torch tensor on DEVICE.
                out = [
                    torch.Tensor(module.get_output(idx).asnumpy())
                    .squeeze(0)
                    .to(DEVICE)
                    for idx in range(module.get_num_outputs())
                ]
            else:
                input_image = torch.Tensor(input_image).to(DEVICE)
                out = [res.squeeze(0) for res in model(input_image)]
        inference_time.append(now() - start)

        (
            heatmaps_result,
            offsets_result,
            displacement_fwd_result,
            displacement_bwd_result,
        ) = out

        start = now()
        if args.decoder == "multi":
            (
                pose_scores,
                keypoint_scores,
                keypoint_coords,
            ) = posenet.decode_multiple_poses(
                heatmaps_result,
                offsets_result,
                displacement_fwd_result,
                displacement_bwd_result,
                output_stride,
                max_pose_detections=10,
                min_pose_score=0.25,
            )
        elif args.decoder == "single":
            (keypoints, pose_score, keypoint_scores) = posenet.decode_single_pose(
                heatmaps_result, offsets_result, output_stride
            )
            # Wrap the single pose in a leading axis so that it is indexed
            # like the multi-pose result below.
            pose_scores = np.asarray([pose_score])
            keypoint_scores = np.asarray([keypoint_scores])
            keypoint_coords = np.asarray([keypoints])
        else:
            raise NotImplementedError(
                "The decoder {} is not implemented.".format(args.decoder)
            )
        processing_time.append(now() - start)

        keypoint_coords *= output_scale

        if args.output_dir:
            draw_image = posenet.draw_skel_and_kp(
                draw_image,
                pose_scores,
                keypoint_scores,
                keypoint_coords,
                min_pose_score=0.25,
                min_part_score=0.25,
            )
            cv2.imwrite(
                os.path.join(
                    args.output_dir, os.path.relpath(filename, args.image_dir)
                ),
                draw_image,
            )
            if args.save_keypoints:
                with open(
                    os.path.join(
                        args.output_dir,
                        os.path.relpath(filename, args.image_dir) + ".npy",
                    ),
                    "wb",
                ) as outfile:
                    np.save(
                        outfile,
                        list(zip(pose_scores, keypoint_scores, keypoint_coords)),
                    )

        if args.verbose:
            print("Results for image: %s" % filename)
            for point_idx in range(len(pose_scores)):
                # Stop printing at the first pose whose score is exactly zero.
                if pose_scores[point_idx] == 0.0:
                    break
                print("Pose #%d, score = %f" % (point_idx, pose_scores[point_idx]))
                for keypoint_idx, (score, coord) in enumerate(
                    zip(keypoint_scores[point_idx, :], keypoint_coords[point_idx, :, :])
                ):
                    print(
                        "Keypoint %s, score = %f, coord = %s"
                        % (posenet.PART_NAMES[keypoint_idx], score, coord)
                    )

    avg_preprocessing_time = np.mean(preprocessing_time)
    avg_postprocessing_time = np.mean(processing_time)
    avg_inference_time = np.mean(inference_time)

    print("=" * 80)
    print(
        "Decoder: {}, TVM Runtime: {}, Resize to {}x{} HxW: {}".format(
            args.decoder,
            "enabled" if args.use_tvm else "disabled",
            args.processing_height,
            args.processing_width,
            "enabled" if args.resize else "disabled",
        )
    )
    print("-" * 80)
    print("Average pre-processing FPS: {:.2f}".format(1 / avg_preprocessing_time))
    print("Average inference FPS: {:.2f}".format(1 / avg_inference_time))
    print("Average post-processing FPS: {:.2f}".format(1 / avg_postprocessing_time))
    print(
        "Average FPS: {:.2f}".format(
            1
            / (avg_postprocessing_time + avg_inference_time + avg_preprocessing_time)
        )
    )
def main():
    """Benchmark resnet or mobilenet on a Raspberry Pi reached through RPC.

    Parses the command line, builds the chosen NNVM workload for the ARM
    target, uploads the object file to the RPC server at ``--host``/``--port``,
    runs it once, then prints three timing measurements with a 45 second
    pause after each.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host', type=str, required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level
    target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"
    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)

    # Pick the workload generator matching the requested model.
    if args.model not in ('resnet', 'mobilenet'):
        raise ValueError('no benchmark prepared for {}.'.format(args.model))
    if args.model == 'resnet':
        workload = nnvm.testing.resnet
    else:
        workload = nnvm.testing.mobilenet
    net, params = workload.get_workload(batch_size=1, image_shape=image_shape)

    with nnvm.compiler.build_config(opt_level=opt_level), tvm.target.rasp():
        graph, lib, params = nnvm.compiler.build(
            net, target, shape={"data": data_shape}, params=params)

    # Save the object file locally, then ship it to the device.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {name: tvm.nd.array(value, ctx) for name, value in params.items()}

    module = runtime.create(graph, rlib, ctx)
    random_input = np.random.uniform(size=(data_shape)).astype("float32")
    module.set_input('data', tvm.nd.array(random_input))
    module.set_input(**rparams)
    module.run()
    # Fetch and convert the output once; the converted array is discarded.
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for _ in range(3):
        print(ftimer())
        # sleep for avoiding cpu overheat
        time.sleep(45)
def tracer(module, info, is_before): pass #global timing #if bool(is_before): # timing = time.time() #else: # print('Executes: ', info.name, (time.time() - timing) * 1000) passes = [(1, tensorizer.rewrite)] with tvm.transform.PassContext(opt_level=4, trace=tracer, config={'tir.add_lower_pass': passes}): #with tvm.transform.PassContext(opt_level=4, trace=tracer): #graph, lib, params = tvm.relay.build(module, target='cuda -libs=cublas,cudnn') graph, lib, params = tvm.relay.build(module, target='nvptx') module = runtime.create(graph, lib, tvm.gpu()) x_ = (np.random.randn(n, c, h, w) * 128).astype('float32') module.set_input('x', x_) timer = module.module.time_evaluator('run', ctx=tvm.gpu(), number=1, repeat=1) timed = timer() print((n * oc * (h - kh + 1) * (w - kw + 1)) * (kh * kw * ic) / timed.mean / 1e9) print('%d us' % int(timed.mean * 1e6))
def test_vgg():
    """Build a small NHWC VGG-11 with NNVM, run it once and print the result.

    The network is compiled for the ``nnpu`` device (through the NNPU build
    configuration) and timed with one run. The output, the deployed graph IR
    and the scaled mean run time are printed.
    """

    def get_feature(internel_layer, layers, filters, batch_norm=False):
        """
        Get VGG feature body as stacks of convolutions.
        layers : [1, 1, 2, 2, 2]
        filters : [64, 128, 256, 512, 512]
        """
        for i, num in enumerate(layers):
            # With the default layers above:
            #   i = 0, num = 1
            #   i = 1, num = 1
            #   i = 2, num = 2
            #   i = 3, num = 2
            #   i = 4, num = 2
            for j in range(num):
                # Explicit one-pixel padding of the two spatial axes (NHWC).
                internel_layer = sym.pad(data=internel_layer,
                                         pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
                internel_layer = sym.conv2d(data=internel_layer,
                                            kernel_size=(3, 3),
                                            channels=filters[i],
                                            layout='NHWC',
                                            kernel_layout='HWOI',
                                            name="conv%s_%s" % (i + 1, j + 1))
                if batch_norm:
                    internel_layer = sym.batch_norm(data=internel_layer,
                                                    axis=3,
                                                    name="bn%s_%s" % (i + 1, j + 1))
                internel_layer = sym.relu(data=internel_layer,
                                          name="relu%s_%s" % (i + 1, j + 1))
            internel_layer = sym.max_pool2d(data=internel_layer,
                                            pool_size=(2, 2),
                                            strides=(2, 2),
                                            layout="NHWC",
                                            name="pool%s" % (i + 1))
        return internel_layer

    def get_classifier(input_data, num_classes):
        """
        Get VGG classifier layers as fc layers.
        """
        flatten = sym.flatten(data=input_data, name="flatten")
        fc1 = sym.dense(data=flatten, units=32, name="fc1")
        relu1 = sym.relu(data=fc1, name="relu1")
        drop1 = sym.dropout(data=relu1, rate=0.5, name="drop1")
        fc2 = sym.dense(data=drop1, units=32, name="fc2")
        relu2 = sym.relu(data=fc2, name="relu2")
        drop2 = sym.dropout(data=relu2, rate=0.5, name="drop2")
        fc3 = sym.dense(data=drop2, units=num_classes, name="fc3")
        return fc3

    def get_symbol(datas, num_classes, num_layers=11, batch_norm=False):
        """
        Parameters
        ------------
        num_classes : int
            Number of classification classes
        num_layers : int
            Number of layers for the variant of vgg. Options are 11, 13, 16, 19
        batch_norm : bool, default False
            Use batch normalization.
        """
        vgg_spec = {
            11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])
        }
        if num_layers not in vgg_spec:
            raise ValueError(
                "Invalide num_layers {}. Choices are 11, 13, 16, 19.".format(
                    num_layers))
        layers, filters = vgg_spec[num_layers]
        feature = get_feature(datas, layers, filters, batch_norm)
        classifier = get_classifier(feature, num_classes)
        symbol = sym.softmax(data=classifier, name="softmax")
        return symbol

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())

    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph, target, shape={"data": input_shape},
                dtype="float32", target_host=target_host)
        else:
            nnpu.set_device(nnpu.get_env(), type='SC')
            with ScheduleProcHelper():
                with nnpu.build_config():
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph, target, shape={"data": input_shape},
                        dtype="float32", target_host=target_host)

    ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
        str("llvm"), 0)
    module = runtime.create(deploy_graph, lib, ctx)
    a_np = np.random.uniform(size=input_shape, low=-32, high=32).astype(np.float32)
    print(a_np)
    module.set_input(data=a_np)

    ftimer = module.module.time_evaluator("run", ctx, number=num_runs, repeat=1)
    # The timed run is the only execution of the graph, so it must happen
    # before the output is fetched. Previously the output was read first and
    # therefore never held a computed result.
    timing = ftimer()
    out = module.get_output(0, out=tvm.nd.empty((1, 16)))
    # Call asnumpy() so that the array is printed rather than the bound method.
    print(out.asnumpy())
    print(deploy_graph.ir())
    print(timing.mean * 10)
def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
    """Compare the compiled ONNX ``LRN`` operator with a NumPy reference.

    Parameters
    ----------
    shape : tuple of int
        4-D input shape; the normalisation window slides over axis 1.
    nsize : int
        Window size (the ONNX ``size`` attribute).
    dtype : str
        Element type of the random input and of the compared output.
    alpha, beta, bias : float, optional
        LRN attributes. When all three are ``None`` the node is created
        without them and the reference uses 0.0001, 0.75 and 1.0.

    Raises
    ------
    AssertionError
        If the compiled output differs from the reference beyond 1e-5.
    """
    in_array = np.random.uniform(size=shape).astype(dtype)

    if alpha is None and beta is None and bias is None:
        alpha = 0.0001
        beta = 0.75
        bias = 1.0
        node = onnx.helper.make_node('LRN', inputs=['in'], outputs=['out'],
                                     size=nsize)
    else:
        node = onnx.helper.make_node('LRN', inputs=['in'], outputs=['out'],
                                     alpha=alpha, beta=beta, bias=bias,
                                     size=nsize)

    graph = helper.make_graph(
        [node],
        "lrn_test",
        inputs=[
            helper.make_tensor_value_info("in", TensorProto.FLOAT, list(shape))
        ],
        outputs=[
            helper.make_tensor_value_info("out", TensorProto.FLOAT, list(shape))
        ])
    model = helper.make_model(graph, producer_name='lrn_test')

    def _get_python_lrn():
        """Reference LRN: divide by (bias + alpha / nsize * window sum of squares) ** beta."""
        # Bound the window by the real channel count. This used to be a
        # hard-coded 5, which is only correct for inputs with at most 5 channels.
        num_channels = in_array.shape[1]
        below = int(math.floor((nsize - 1) / 2))
        above = int(math.ceil((nsize - 1) / 2))
        square_sum = np.zeros(shape).astype(dtype)
        for n, c, h, w in np.ndindex(in_array.shape):
            window = in_array[n,
                              max(0, c - below):min(num_channels, c + above + 1),
                              h, w]
            square_sum[n, c, h, w] = sum(window ** 2)
        return in_array / ((bias + (alpha / nsize) * square_sum) ** beta)

    # The reference does not depend on the target, so compute it once.
    py_out = _get_python_lrn()

    for target, ctx in ctx_list():
        new_sym, params = nnvm.frontend.from_onnx(model)
        input_name = model.graph.input[0].name
        shape_dict = {input_name: in_array.shape}
        dtype_dict = {input_name: dtype}
        graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict,
                                                 dtype_dict, params=params)
        m = graph_runtime.create(graph, lib, ctx)
        # set inputs
        m.set_input(input_name, tvm.nd.array(in_array.astype(dtype)))
        m.set_input(**params)
        m.run()
        # get outputs
        tvm_out = m.get_output(0, tvm.nd.empty(shape, dtype))
        tvm.testing.assert_allclose(py_out, tvm_out.asnumpy(),
                                    rtol=1e-5, atol=1e-5)
def verify_model(model_name, input_data=[], custom_convert_map={}, ctx_list=ctx_list()):
    """Assert that the output of a compiled model matches with that of its baseline.

    ``model_name`` is either a string handed to ``load_model`` or the model
    object itself, in which case ``input_data`` supplies its inputs (a list,
    or a single tensor). The model is traced, converted with the PyTorch
    frontend, built for every ``(target, ctx)`` pair in ``ctx_list`` and each
    output is compared with the PyTorch result (rtol = atol = 1e-3).
    """
    # Resolve the baseline model and its list of inputs.
    if isinstance(model_name, str):
        baseline_model, baseline_input = load_model(model_name)
    elif isinstance(input_data, list):
        baseline_model, baseline_input = model_name, input_data
    elif isinstance(input_data, torch.Tensor) or len(input_data.shape) == 0:
        baseline_model, baseline_input = model_name, [input_data]
    else:
        assert False, "Unexpected input format"

    if torch.cuda.is_available():
        baseline_model = baseline_model.cuda()
        baseline_input = [tensor.cuda() for tensor in baseline_input]

    with torch.no_grad():
        baseline_outputs = baseline_model(*baseline_input)

    # Normalise the reference outputs to a tuple of NumPy arrays.
    if not isinstance(baseline_outputs, tuple):
        baseline_outputs = (baseline_outputs.float().cpu().numpy(), )
    else:
        baseline_outputs = tuple(
            result.cpu().numpy() for result in baseline_outputs)

    trace = torch.jit.trace(baseline_model, baseline_input).float().eval()
    trace = trace.cuda() if torch.cuda.is_available() else trace.cpu()

    input_names = ["input{}".format(idx) for idx in range(len(baseline_input))]
    input_shapes = [
        (name, tensor.shape)
        for name, tensor in zip(input_names, baseline_input)
    ]
    mod, params = relay.frontend.from_pytorch(trace, input_shapes,
                                              custom_convert_map)
    compiled_input = {
        name: tensor.cpu().numpy()
        for name, tensor in zip(input_names, baseline_input)
    }

    with relay.build_config(opt_level=3):
        for target, ctx in ctx_list:
            relay_graph, relay_lib, relay_params = relay.build(
                mod, target=target, params=params)
            relay_model = graph_runtime.create(relay_graph, relay_lib, ctx)
            relay_model.set_input(**relay_params)
            for name in compiled_input:
                relay_model.set_input(name, compiled_input[name])
            relay_model.run()

            for index in range(len(baseline_outputs)):
                expected = baseline_outputs[index]
                actual = relay_model.get_output(index).asnumpy()
                assert_shapes_match(expected, actual)
                tvm.testing.assert_allclose(expected, actual,
                                            rtol=1e-3, atol=1e-3)

    # Drop the local references to the model and clear the CUDA cache.
    del model_name
    del baseline_model
    torch.cuda.empty_cache()
def run_tvm(data, symbol_file, num_inference_images, sym, devs, label_name):
    """Run a precompiled TVM module and the MXNet model side by side.

    The compiled artifacts are loaded from ``./compiled/<model>_deploy_*``,
    where ``<model>`` is the base name of ``symbol_file`` without ``.json``.
    For each batch the top-1 / top-5 hits of both back ends are counted
    against the batch label; the accuracies are printed once
    ``num_inference_images`` batches have been processed or ``data`` is
    exhausted.

    Parameters
    ----------
    data : iterable
        Batch iterator; must provide ``provide_data`` / ``provide_label``.
    symbol_file : str
        Path of the symbol JSON file; only its base name is used here.
    num_inference_images : int
        Number of batches after which evaluation stops.
    sym, devs :
        Symbol and context(s) passed to ``mx.mod.Module``.
    label_name : str
        Label name passed to ``mx.mod.Module``.

    Notes
    -----
    ``arg_params`` and ``aux_params`` are not defined in this function and are
    read from the enclosing module. A ``ZeroDivisionError`` is raised if no
    batch is processed.
    """
    debug = False
    import tvm
    from tvm.contrib import graph_runtime
    from tvm.contrib.debugger import debug_runtime as debug_runtime

    base = './compiled/' + symbol_file.split('/')[-1].replace('.json', '')
    path_lib = base + '_deploy_lib.tar'
    path_graph = base + '_deploy_graph.json'
    path_params = base + '_deploy_params.params'

    # Context managers close the files deterministically; the handles used to
    # be left for the garbage collector.
    with open(path_graph) as graph_file:
        graph = graph_file.read()
    lib = tvm.runtime.load_module(path_lib)
    with open(path_params, 'rb') as params_file:
        params = bytearray(params_file.read())

    if debug:
        rt_mod = debug_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym, context=devs)
        mod.bind(for_training=False, data_shapes=data.provide_data)
    else:
        rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
        mod = mx.mod.Module(symbol=sym, context=devs, label_names=[
            label_name,
        ])
        mod.bind(for_training=False,
                 data_shapes=data.provide_data,
                 label_shapes=data.provide_label)
    rt_mod.load_params(params)
    mod.set_params(arg_params, aux_params)

    counter = 0
    top_1_raw = 0
    top_5_raw = 0
    top_1_raw_mxnet = 0
    top_5_raw_mxnet = 0

    if debug:
        data = advance_data_iter(data, 0)

    for batch in data:
        # Get the original label.
        correct_label = int(batch.label[0].asnumpy()[0])

        rt_mod.set_input('data', batch.data[0].asnumpy())
        rt_mod.run()
        if debug:
            np.set_printoptions(suppress=False)
            for i in rt_mod.debug_datum.get_output_tensors().keys():
                print(i, rt_mod.debug_get_output(i))
            return

        tvm_res = rt_mod.get_output(0).asnumpy()
        mod.forward(batch, is_train=False)
        mxnet_res = mod.get_outputs()[0].asnumpy()

        if debug:
            # NOTE: unreachable while the debug branch above returns early.
            print("######## MxNet ###########")
            print(mxnet_res[0][0])
            print("######## TVM ###########")
            print(tvm_res[0][0])
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("############################")
            print("######## MxNet ###########")
            print(mxnet_res)
            print("######## TVM ###########")
            print(tvm_res)
            #print("######## Diff ###########")
            # it = np.nditer(mxnet_res, flags=['multi_index'])
            # while not it.finished:
            #     print("%d <%s>" % (it[0], it.multi_index), end='\n')
            #     it.iternext()
            np.testing.assert_allclose(mxnet_res.astype('int32'),
                                       tvm_res.astype('int32'),
                                       atol=0, verbose=True)
            try:
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=0, verbose=True)
            except AssertionError:
                # Exact match failed: retry with a tolerance of one. Only the
                # comparison failure is caught, not every exception.
                np.testing.assert_allclose(mxnet_res.astype('int32'),
                                           tvm_res.astype('int32'),
                                           atol=1, verbose=True)
        else:
            # Indices of the five largest scores, best first.
            tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1]
            mxnet_pred = np.squeeze(mxnet_res).argsort()[-5:][::-1]

            if correct_label == tvm_pred[0]:
                top_1_raw += 1
                top_5_raw += 1
            elif correct_label in tvm_pred:
                top_5_raw += 1

            if correct_label == mxnet_pred[0]:
                top_1_raw_mxnet += 1
                top_5_raw_mxnet += 1
            elif correct_label in mxnet_pred:
                top_5_raw_mxnet += 1

        counter += 1
        if counter == num_inference_images:
            break

    model_name = symbol_file.split('/')[-1].replace('.json', '')
    top_1 = float(top_1_raw_mxnet) / float(counter)
    top_5 = float(top_5_raw_mxnet) / float(counter)
    print("Mxnet", model_name, top_1, top_5, sep='\t')

    top_1 = float(top_1_raw) / float(counter)
    top_5 = float(top_5_raw) / float(counter)
    print("Tvm", model_name, top_1, top_5, sep='\t')
def test_tflite_anistropic_strides():
    """Check a uint8 qnn.conv2d with strides (1, 3) against golden values.

    Runs with ``FTVMQnnLegalize`` of ``qnn.conv2d`` temporarily set to
    ``legalize_qnn_conv2d``.
    """
    with TempOpAttr("qnn.conv2d", "FTVMQnnLegalize", legalize_qnn_conv2d):
        # uint8 input
        data_shape, data_dtype = (1, 1, 3, 6), "uint8"
        kernel_shape, kernel_dtype = (1, 1, 2, 2), "uint8"
        ref_func, qnn_func = get_funcs(
            data_shape=data_shape,
            data_dtype=data_dtype,
            kernel_shape=kernel_shape,
            kernel_dtype=kernel_dtype,
            input_zero_point=127,
            kernel_zero_point=127,
            input_scale=1.0,
            kernel_scale=1.0,
            kernel_size=(2, 2),
            padding=(0, 0),
            strides=(1, 3),
            dilation=(1, 1),
            data_layout="NCHW",
            kernel_layout="OIHW",
            out_dtype="int32",
        )

        # Three rows of six input values.
        input_values = (
            133, 131, 129, 125, 123, 121,
            135, 133, 131, 123, 121, 119,
            137, 135, 133, 121, 119, 117,
        )
        golden_data = np.array(input_values).reshape(data_shape).astype("uint8")
        golden_weight = (
            np.array((129, 131, 133, 135)).reshape(kernel_shape).astype("uint8")
        )

        with tvm.transform.PassContext(opt_level=2):
            graph, lib, params = relay.build(
                qnn_func, "llvm", params={"kernel": golden_weight})
            mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
            mod.set_input("data", golden_data)
            mod.set_input(**params)
            mod.run()
            qnn_output = mod.get_output(0).asnumpy()

        golden_output = np.array((124, -92, 164, -132)).reshape(1, 1, 2, 2)
        np.testing.assert_equal(qnn_output, golden_output)
# The following is my environment, change this to the IP address of your target device host = '127.0.0.1' port = 9090 remote = rpc.connect(host, port) path = "deploy_lib.tar" remote.upload(path) remote_lib = remote.load_module(path) ctx = remote.gpu() # load the module back. loaded_graph = open("deploy_graph.json").read() loaded_params = bytearray(open("deploy_param.params", "rb").read()) module = runtime.create(loaded_graph, remote_lib, ctx) # set parameter (upload params to the remote device. This may take a while) input_name = 'input_1' input_data = tvm.nd.array(x.astype(dtype)) # module.set_input(**loaded_params) # module.set_input(input_name, tvm.nd.array(x.astype(dtype))) # module.run() module.load_params(loaded_params) module.set_input(input_name, tvm.nd.array( x.astype(dtype))) # key = input_name, value = array module.run() # get output
def tune_and_evaluate(tuning_opt):
    """Tune the extracted conv2d tasks, then build, upload and time the model.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded unchanged to ``tune_tasks``.

    The function reads the module-level names ``env``, ``target``,
    ``device``, ``log_file``, ``tracker_host`` and ``tracker_port``.
    """
    if env.TARGET == "sim":
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()
    else:
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, tracker_port, timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_model()
    tasks = autotvm.task.extract_from_program(
        func=relay_prog,
        params=params,
        ops=(tvm.relay.op.nn.conv2d, ),
        target=target,
        target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for task in tasks:
        print("\t{}".format(task))

    # The early ``return`` that skipped tuning is commented out in this
    # version, so tuning always runs.
    # return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        build_kwargs = dict(
            target=target, params=params, target_host=env.target_host)
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name == "vta":
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog, **build_kwargs)
            else:
                graph, lib, params = relay.build(relay_prog, **build_kwargs)

        # Export library
        print("Upload...")
        workdir = util.tempdir()
        object_path = workdir.relpath("graphlib.o")
        lib.save(object_path)
        remote.upload(object_path)
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        if device == "vta":
            ctx = remote.ext_dev(0)
        else:
            ctx = remote.cpu(0)
        executor = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        random_image = np.random.uniform(size=(1, 3, 224, 224))
        image = tvm.nd.array(random_image.astype('float32'))
        executor.set_input(**params)
        executor.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        evaluator = executor.module.time_evaluator(
            "run", ctx, number=1, repeat=10)
        cost = evaluator()
        prof_res = np.array(cost.results) * 1000  # convert to millisecond
        mean_ms = np.mean(prof_res)
        std_ms = np.std(prof_res)
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (mean_ms, std_ms))
batchsize = 1
total_time_ms = 0
global_step = 0
config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=28,
                                  inter_op_parallelism_threads=1)

# Load the exported TVM module back. The graph JSON and parameter blob are
# read through context managers so their file handles are closed promptly.
path_lib = "./export/deploy_lib.tar"
with open("./export/deploy_graph.json") as graph_file:
    loaded_json = graph_file.read()
loaded_lib = tvm.runtime.load_module(path_lib)
with open("./export/deploy_param.params", "rb") as params_file:
    loaded_params = bytearray(params_file.read())
ctx = tvm.cpu()
module = graph_runtime.create(loaded_json, loaded_lib, ctx)
module.load_params(loaded_params)

with tf.compat.v1.Session(config=config) as sess:
    # Parse the frozen fp32 TensorFlow graph and list every node in it.
    with gfile.FastGFile(
            os.path.join(base_path, "pb_models") + '/freeze_fp32.pb',
            'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
        for node in graph_def.node:
            print("node name is: {} \t node op is: {}".format(
                node.name, node.op))
        sess.graph.as_default()
def graph_to_function(graph, target, ctx, shape=None, dtype=None):
    """Convert a graph to a function taking keyword args and returning a list
    of results (both args and results are numpy arrays).

    Example::

        fun = graph_to_function(graph, llvm, cpu(0))
        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))

    Parameters
    ----------
    graph : nnvm.graph.Graph
        A graph we want to convert to a function.

    target : str or :any:`tvm.target.Target`
        The build target

    ctx : TVMContext
        The context to deploy the module.

    shape : Dict[str, Tuple[int]], optional
        A dict mapping input variable names to shapes.
        By default shapes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    dtype : Dict[str, str] or str, optional
        A dict mapping input variable names to dtypes, or just a single dtype.
        By default dtypes will be inferred from variables' attributes.
        Note that this parameter takes precedence over variables' attributes.

    Returns
    -------
    function : Callable[..., List[numpy.ndarray]]

    Raises
    ------
    ValueError
        If, after inference, some input variable still has no dtype or an
        empty/unknown shape.
    """
    # Infer missing shapes and dtypes
    graph, shape, dtype, output_shapes, output_dtypes = \
        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)

    if None in dtype.values():
        raise ValueError("Input variables with no type: {}".format(dtype))

    if not all(shape.values()):
        raise ValueError("Input variables with no shape: {}".format(shape))

    compute_graph, lib, params = nnvm.compiler.build(graph, target,
                                                     shape=shape, dtype=dtype)
    module = graph_runtime.create(compute_graph, lib, ctx)

    if params:
        # The graph runtime module exposes ``set_input`` (singular); the
        # previous ``set_inputs`` call failed as soon as the build returned
        # any parameters.
        module.set_input(**params)

    def run(**kwargs):
        """Run the module on the given inputs and collect every output."""
        module.run(**kwargs)
        return [
            module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy()
            for i, (o_shape, o_dtype)
            in enumerate(zip(output_shapes, output_dtypes))
        ]

    return run
# With RPC, you can deploy the model remotely from your host machine # to the remote device. # obtain an RPC session from remote device. if local_demo: remote = rpc.LocalSession() else: # The following is my environment, change this to the IP address of your target device host = '10.77.1.162' port = 9090 remote = rpc.connect(host, port) # upload the library to remote device and load it remote.upload(lib_fname) rlib = remote.load_module('net.tar') # create the remote runtime module ctx = remote.cpu(0) module = runtime.create(graph, rlib, ctx) # set parameter (upload params to the remote device. This may take a while) module.set_input(**params) # set input data module.set_input('data', tvm.nd.array(x.astype('float32'))) # run module.run() # get output out = module.get_output(0) # get top1 result top1 = np.argmax(out.asnumpy()) print('TVM prediction top-1: {}'.format(synset[top1]))
lib_name = "main.so" elif platform.system() == "Windows": lib_name = "main.dll" else: raise Exception("unknown system " + platform.system()) print("export_library main lib") lib.export_library(lib_name) # or save object file for deploy usage # lib.save(os.path.join(work_root, binary_dir, 'model.o')) print("load main lib") sysLib = tvm.runtime.load_module(lib_name) ctx = tvm.cpu(0) input_data = np.random.random(dshape).astype(np.float32) for fk in ret_mods: mg = ret_mods[fk].get_json() mp = ret_mods[fk].get_params() print("test " + fk + " ------------------------------------") module = graph_runtime.create(mg, sysLib, ctx) module.load_params(relay.save_param_dict(mp)) module.set_input("data", tvm.nd.array(input_data)) module.run() num_output = module.get_num_outputs() for idx in range(num_output): print(module.get_output(idx).shape)
fo.write(nnvm.compiler.save_param_dict(params)) print(temp.listdir()) ###################################################################### # Deploy locally to Nvidia GPU # ------------------------------ # Now we can load the module back. import numpy as np from tvm.contrib import graph_runtime loaded_lib = tvm.module.load(path_lib) loaded_json = open(temp.relpath("deploy_graph.json")).read() loaded_params = bytearray( open(temp.relpath("deploy_param.params"), "rb").read()) module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0)) module.load_params(loaded_params) input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) module.run(data=input_data) out = module.get_output(0, out=tvm.nd.empty(out_shape)) # Print first 10 elements of output print(out.asnumpy()[0][0:10]) ###################################################################### # Compile and Deploy the Model to Raspberry Pi Remotely with RPC # -------------------------------------------------------------- # Following the steps above, we can also compile the model for Raspberry Pi. # TVM provides rpc module to help with remote deploying. # # For demonstration, we simply start an RPC server on the same machine,
def test_tensorrt_image_classification_models():
    """Compare TensorRT-enabled builds against plain CUDA builds.

    For each workload in ``workload_dict`` the network is built once as a
    baseline and once with ``ext_accel='tensorrt'``; both are run on the same
    random input and the outputs must agree to 5 decimal places.
    """

    def compile_model(graph, params, data_shapes, subgraph_backend=None,
                      op_names=None, **kwargs):
        """Build ``graph`` for CUDA and return (graph, lib, params, out_shape).

        When both ``subgraph_backend`` and ``op_names`` are given the graph is
        partitioned explicitly and ``kwargs`` are NOT forwarded to
        ``build_config``; otherwise ``kwargs`` become build_config flags.
        """
        _, output_shapes = nnvm.compiler.graph_util.infer_shape(
            graph, **data_shapes)
        assert len(output_shapes) == 1
        flags = kwargs
        if subgraph_backend is not None and op_names is not None:
            graph = nnvm.subgraph._partition(graph, subgraph_backend, op_names)
            flags = {}
        target = tvm.target.cuda()
        with nnvm.compiler.build_config(opt_level=3, **flags):
            graph, lib, params = nnvm.compiler.build(
                graph, target, shape=data_shapes, params=params)
        return graph, lib, params, output_shapes[0]

    def get_output(module, data, params, output_shape):
        """Run ``module`` on ``data`` and return output 0 as a numpy array.

        ``output_shape`` is kept for call compatibility; it is unused now that
        the unreachable statements after the ``return`` have been removed.
        """
        module.set_input("data", data)
        module.set_input(**params)
        module.run()
        return module.get_output(0).asnumpy()

    def copy_params(params):
        """Return a new dict with every parameter wrapped by tvm.nd.array."""
        return {k: tvm.nd.array(v) for k, v in params.items()}

    def check_trt_model(baseline_module, baseline_params, graph, params,
                        data_shape, subgraph_backend=None, op_names=None,
                        **kwargs):
        """Build the TensorRT variant and compare it with the baseline."""
        trt_graph, trt_lib, trt_params, output_shape = compile_model(
            graph, params, {'data': data_shape}, subgraph_backend, op_names,
            **kwargs)
        data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
        baseline_out = get_output(baseline_module, data, baseline_params,
                                  output_shape)
        trt_module = graph_runtime.create(trt_graph, trt_lib, tvm.gpu())
        trt_out = get_output(trt_module, data, trt_params, output_shape)
        np.testing.assert_almost_equal(baseline_out, trt_out, decimal=5)

    workload_dict = {
        'resnet': nnvm.testing.resnet.get_workload,
        'inception_v3': nnvm.testing.inception_v3.get_workload,
        'mobilenet': nnvm.testing.mobilenet.get_workload,
        'mobilenet_v2': nnvm.testing.mobilenet_v2.get_workload,
        'squeezenet': nnvm.testing.squeezenet.get_workload,
        'vgg': nnvm.testing.vgg.get_workload,
        'densenet': nnvm.testing.densenet.get_workload,
    }
    for model_name, get_workload in workload_dict.items():
        # Lazy %-args: the message is only formatted if the record is emitted.
        logging.info('Testing TensorRT for model %s', model_name)
        flags = {
            'batch_size': 1,
            'image_shape': (3, 224, 224),
            'num_classes': 100,
        }
        if model_name == 'inception_v3':
            flags['image_shape'] = (3, 299, 299)
        if model_name.startswith('resnet'):
            flags['num_layers'] = 18
        # data_shape must be computed before 'image_shape' is possibly
        # dropped from the flags below.
        data_shape = (flags['batch_size'], ) + flags['image_shape']
        if model_name in ('mobilenet_v2', 'densenet'):
            flags.pop('image_shape')
        net, params = get_workload(**flags)
        graph_json_str = nnvm.graph.create(net).json()
        with nnvm.compiler.build_config(opt_level=3):
            baseline_graph, baseline_lib, baseline_params = \
                nnvm.compiler.build(
                    nnvm.graph.load_json(graph_json_str),
                    tvm.target.cuda(),
                    shape={'data': data_shape},
                    params=copy_params(params))
        baseline_module = graph_runtime.create(baseline_graph, baseline_lib,
                                               tvm.gpu())

        # test whole graph run using tensorrt, nnvm.compiler.build_config has
        # graph partitioning turned on
        check_trt_model(baseline_module, baseline_params,
                        nnvm.graph.load_json(graph_json_str),
                        copy_params(params), data_shape,
                        ext_accel='tensorrt')
def run_e2e(graph):
    """Run the end-to-end classification example over every file in `files`.

    Creates a graph runtime for `graph`, classifies each image, prints the
    top-1/top-5 predictions and writes timing and accuracy counters to
    'tmp-ck-timer.json' and 'aggregate-ck-timer.json'.

    Relies on names defined outside this function: debug_fpga_only, lib, ctx,
    timers, files, params, STAT_REPEAT, dtype, remote, val, synset.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; in particular, which statements sit inside
    `if len(val) > 0:` and inside the `for` loop should be confirmed against
    the original file.
    """
    import json

    if debug_fpga_only:
        graph = mark_nop(graph, skip_conv_layer=(0, ))

    # Time the creation of the graph runtime.
    dt = time.time()
    m = graph_runtime.create(graph, lib, ctx)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle files and pre-read JSON with accuracy to continue aggregating it
    # otherwise if FPGA board hangs, we can continue checking random images ...
    import random
    random.shuffle(files)

    # Resume the counters from a previous run when processing a dataset.
    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        x = json.load(open('aggregate-ck-timer.json'))
        if 'total_images' in x:
            total_images = x['total_images']
        if 'correct_images_top1' in x:
            correct_images_top1 = x['correct_images_top1']
        if 'correct_images_top5' in x:
            correct_images_top5 = x['correct_images_top5']

    dt1 = time.time()
    for f in files:
        total_images += 1
        print(
            '==============================================================================='
        )
        print('Image ' + str(total_images) + ' of ' + str(len(files)) +
              ' : ' + f)

        # Load the image, force RGB and resize to 224x224.
        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        m.set_input('data', tvm.nd.array(img.astype("float32")))
        m.set_input(**params)

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = m.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        # Wall-clock time per repetition, measured around the evaluator call.
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs (output 0 is read into a 1000-element buffer)
        tvm_output = m.get_output(0,
                                  tvm.nd.empty((1000, ), dtype, remote.cpu(0)))
        top1 = np.argmax(tvm_output.asnumpy())

        top5 = []
        atop5 = get_top5(tvm_output.asnumpy())

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        for q in atop5:
            # q[1] is used as the class index into synset.
            x = q[1]
            y = synset[x]
            top5.append(x)
            print(x, y)

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val) > 0:
            # Expected label, keyed by the image's base file name.
            top = val[os.path.basename(f)]

            correct_top1 = False
            if top == top1:
                correct_top1 = True
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1: ' + ('%.5f' % accuracy_top1))

            correct_top5 = False
            if top in top5:
                correct_top5 = True
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5: ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) +
                  ' sec.')

            # accuracy_top1/accuracy_top5 only exist when labels are available.
            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        # Both files receive the same timers dict.
        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()
def run_case(dtype, image, target):
    """Classify one image, or an ImageNet validation set, with an MXNet model.

    Parameters
    ----------
    dtype : str
        Data type used for the parameters and the input tensor.
    image : str or None
        Path of a single image. When None or '', every file whose lower-cased
        name starts with 'ilsvrc2012_val_' in the directory named by
        CK_ENV_DATASET_IMAGENET_VAL is classified and checked against the
        labels listed in the file named by CK_CAFFE_IMAGENET_VAL_TXT.
    target : str or None
        None or 'cpu' builds for 'llvm' and runs on tvm.cpu(0); 'cuda' builds
        for 'cuda' and runs on tvm.gpu(0).

    Timing and accuracy counters are written to 'tmp-ck-timer.json' and
    'aggregate-ck-timer.json'. The function also reads the module-level names
    `data_shape`, `transform_image`, `get_top5`, `runtime`, `util` and `Image`.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; in particular, which statements sit inside
    `if len(val) > 0:` and inside the `for` loop should be confirmed against
    the original file.
    """
    # Check image
    import os
    import json
    import sys

    # Number of statistical repetitions; defaults to 10 when unset.
    STAT_REPEAT = os.environ.get('STAT_REPEAT', '')
    if STAT_REPEAT == '' or STAT_REPEAT == None:
        STAT_REPEAT = 10
    STAT_REPEAT = int(STAT_REPEAT)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    # NOTE(review): eval() executes the file's contents as Python; this is
    # only acceptable if synset.txt is trusted. ast.literal_eval would be the
    # safer choice if the file holds a plain literal -- confirm.
    synset = eval(open(os.path.join(CATEG_FILE)).read())

    files = []
    val = {}

    if image != None and image != '':
        files = [image]
    else:
        ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '')
        if ipath == '':
            print('Error: path to ImageNet dataset is not set!')
            exit(1)
        if not os.path.isdir(ipath):
            print('Error: path to ImageNet dataset was not found!')
            exit(1)

        # get all files
        d = os.listdir(ipath)
        for x in d:
            x1 = x.lower()
            if x1.startswith('ilsvrc2012_val_'):
                files.append(os.path.join(ipath, x))

        files = sorted(files)

        # Dataset mode uses a single repetition per image.
        STAT_REPEAT = 1

        # Get correct labels: each non-empty line is "<file name> <int label>".
        ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '')
        fval = open(ival).read().split('\n')

        val = {}
        for x in fval:
            x = x.strip()
            if x != '':
                y = x.split(' ')
                val[y[0]] = int(y[1])

    # FGG: set timers
    import time
    timers = {}

    # Get first shape (expect that will be the same for all)
    dt = time.time()
    image = Image.open(os.path.join(files[0])).resize((224, 224))
    if image.mode != 'RGB':
        image = image.convert('RGB')
    timers['execution_time_load_image'] = time.time() - dt

    dt = time.time()
    img = transform_image(image)
    timers['execution_time_transform_image'] = time.time() - dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model
    from mxnet.gluon.utils import download

    model_path = os.environ['CK_ENV_MODEL_MXNET']
    model_id = os.environ['MXNET_MODEL_ID']

    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype != 'float32':
        params = {
            k: tvm.nd.array(v.asnumpy().astype(dtype))
            for k, v in params.items()
        }

    # compile
    # NOTE(review): xtarget (and ctx below) stay unbound for any target other
    # than None/'cpu'/'cuda'.
    if target == None or target == 'cpu':
        xtarget = 'llvm'
    elif target == 'cuda':
        xtarget = 'cuda'

    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 target=xtarget,
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 dtype=dtype,
                                                 target_host=None)

    # upload model to remote device
    # (the library is exported to a temp dir, but the in-memory `lib` is what
    # gets passed to the runtime below)
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if target == None or target == 'cpu':
        ctx = tvm.cpu(0)
    elif target == 'cuda':
        ctx = tvm.gpu(0)

    rlib = lib
    rparams = params

    # create graph runtime
    dt = time.time()
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle files and pre-read JSON with accuracy to continue aggregating it
    # otherwise if FPGA board hangs, we can continue checking random images ...
    import random
    random.shuffle(files)

    # Resume the counters from a previous run when processing a dataset.
    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        x = json.load(open('aggregate-ck-timer.json'))
        if 'total_images' in x:
            total_images = x['total_images']
        if 'correct_images_top1' in x:
            correct_images_top1 = x['correct_images_top1']
        if 'correct_images_top5' in x:
            correct_images_top5 = x['correct_images_top5']

    dt1 = time.time()
    for f in files:
        total_images += 1
        print(
            '==============================================================================='
        )
        print('Image ' + str(total_images) + ' of ' + str(len(files)) +
              ' : ' + f)

        # Load the image, force RGB and resize to 224x224.
        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**rparams)

        # perform some warm up runs
        # print("warm up..")
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        # Wall-clock time per repetition, measured around the evaluator call.
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs (output 0 is read into a 1000-element buffer)
        tvm_output = module.get_output(0, tvm.nd.empty((1000, ), dtype, ctx))
        top1 = np.argmax(tvm_output.asnumpy())

        top5 = []
        atop5 = get_top5(tvm_output.asnumpy())

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        for q in atop5:
            # q[1] is used as the class index into synset.
            x = q[1]
            y = synset[x]
            top5.append(x)
            print(x, y)

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val) > 0:
            # Expected label, keyed by the image's base file name.
            top = val[os.path.basename(f)]

            correct_top1 = False
            if top == top1:
                correct_top1 = True
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1: ' + ('%.5f' % accuracy_top1))

            correct_top5 = False
            if top in top5:
                correct_top5 = True
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5: ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) +
                  ' sec.')

            # accuracy_top1/accuracy_top5 only exist when labels are available.
            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        # Both files receive the same timers dict.
        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()

    return
def run_unpropagatable_graph(dev, tgt):
    r"""Check device annotation on a graph whose placement cannot propagate.

    The network is as following:

        a      b  c      d
         \    /    \    /
          add       mul
            \       /
             subtract

    ``add`` and ``subtract`` are annotated for ``dev`` while ``mul`` is
    annotated for the CPU. The rewritten graph is compared with a hand-built
    one containing a single ``device_copy``, then built, run and compared
    with a numpy reference.
    """
    a, b, c, d = (relay.var(name, shape=(10, 10)) for name in "abcd")
    a_data, b_data, c_data, d_data = (
        np.random.rand(10, 10).astype('float32') for _ in range(4))
    ref_res = (a_data + b_data) - (c_data * d_data)

    fallback_device = tvm.context("cpu")
    cpu_ctx = fallback_device
    dev_ctx = tvm.context(dev)
    target = {"cpu": "llvm", dev: tgt}

    # Annotated function: every op carries an explicit on_device marker which
    # RewriteAnnotatedOps then turns into device copies.
    on_dev_add = relay.annotation.on_device(relay.add(a, b), dev_ctx)
    on_cpu_mul = relay.annotation.on_device(relay.multiply(c, d), cpu_ctx)
    on_dev_sub = relay.annotation.on_device(
        relay.subtract(on_dev_add, on_cpu_mul), dev_ctx)
    annotated_func = run_opt_pass(
        relay.Function([a, b, c, d], on_dev_sub),
        transform.RewriteAnnotatedOps(dev_ctx.device_type))

    # Expected function: only the CPU-side product needs a copy to the device.
    product_on_dev = relay.device_copy(relay.multiply(c, d), cpu_ctx, dev_ctx)
    expected_func = relay.Function(
        [a, b, c, d], relay.subtract(relay.add(a, b), product_on_dev))

    expected_index = [2, 2, 2, 1, 1, 1, 2, 2]
    check_annotated_graph(annotated_func, expected_func)

    inputs = {"a": a_data, "b": b_data, "c": c_data, "d": d_data}
    pass_config = {"relay.fallback_device_type": fallback_device.device_type}
    with tvm.transform.PassContext(opt_level=0, config=pass_config):
        graph, lib, params = relay.build(annotated_func, target,
                                         params=inputs)
        contexts = [tvm.cpu(0), tvm.context(dev)]

        attrs = json.loads(graph)["attrs"]
        if "device_index" in attrs:
            assert attrs["device_index"][1] == expected_index

        executor = graph_runtime.create(graph, lib, contexts)
        executor.set_input(**params)
        executor.run()
        result = executor.get_output(0).asnumpy()
        tvm.testing.assert_allclose(result, ref_res, rtol=1e-5, atol=1e-5)
image = Image.open(img_name).resize((224, 224))


def transform_image(image):
    """Normalize an HWC image and return it as a 1xCxHxW float array.

    Subtracts the per-channel offsets, divides by the per-channel scales,
    moves channels first and prepends a batch axis.
    """
    image = np.array(image) - np.array([123., 117., 104.])
    image /= np.array([58.395, 57.12, 57.375])
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :]
    return image


x = transform_image(image)

ctx = tvm.cpu()
# Load the deployed artifacts back. Context managers close the file handles
# as soon as the contents have been read.
with open("deploy_graph.json") as graph_file:
    loaded_graph = graph_file.read()
loaded_lib = tvm.module.load("./net.tar")
with open("deploy_param.params", "rb") as params_file:
    loaded_params = bytearray(params_file.read())
input_data = tvm.nd.array(x.astype('float32'))

# create the runtime module on the local CPU context
module = runtime.create(loaded_graph, loaded_lib, ctx)
# load the serialized parameters into the module
module.load_params(loaded_params)
# run
module.run(data=input_data)
# get output
out = module.get_output(0)
# get top1 result
top1 = np.argmax(out.asnumpy())
print('TVM prediction top-1: {}'.format(synset[top1]))
def main():
    """Extract tasks, tune them, then build, upload and benchmark the model.

    Reads the module-level names ``target``, ``target_host``,
    ``tuning_option``, ``log_file``, ``device_key`` and ``dtype``.
    """
    # extract workloads from relay program
    input_shape = (1, 3, 224, 224)
    print("Extrack tasks...")
    mod, params = get_workload(image_shape=input_shape[1:],
                               batch_size=input_shape[0])
    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        target_host=target_host,
        params=params,
        ops=(
            relay.op.nn.conv2d,
            relay.op.nn.dense,
        ))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_option)

    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=0):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params, target_host=target_host)

        # Export the library, ship it to the remote device and load it there.
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))
        remote = autotvm.measure.request_remote(device_key,
                                                '0.0.0.0',
                                                9192,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        print("Run...")
        print("Set_input(\"data\")")
        module.set_input('data', data_tvm)
        print("Set_input(**param)")
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1,
                                              repeat=600)
        # Run the benchmark exactly once and derive every statistic from the
        # same measurements; calling ftimer() again would repeat all 600 runs
        # and report a minimum from a different run than the mean.
        results = ftimer().results
        prof_res = np.array(results) * 1000  # convert to millisecond
        # Fastest single run, in the evaluator's native unit (not scaled).
        print(min(results))
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def run_case(model, dtype):
    """Build one benchmark model for Mali, run it and print its mean cost.

    Parameters
    ----------
    model : str
        One of 'vgg16', 'resnet18' or 'mobilenet'.
    dtype : str
        Data type of the workload; 'float32' builds with opt_level 2, any
        other value with opt_level 1.

    Raises
    ------
    ValueError
        If no benchmark is prepared for ``model``.

    Reads the module-level names ``image_shape``, ``data_shape`` and ``args``.
    """
    # load model
    workload_kwargs = dict(batch_size=1, image_shape=image_shape, dtype=dtype)
    if model == 'vgg16':
        net, params = nnvm.testing.vgg.get_workload(num_layers=16,
                                                    **workload_kwargs)
    elif model == 'resnet18':
        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
                                                       **workload_kwargs)
    elif model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(**workload_kwargs)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(model))

    # compile
    if dtype == 'float32':
        opt_level = 2
    else:
        opt_level = 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net,
            tvm.target.mali(),
            shape={"data": data_shape},
            params=params,
            dtype=dtype,
            target_host=args.target_host)

    # export the library so it can be uploaded to a remote device
    workdir = util.tempdir()
    lib_fname = workdir.relpath('net.tar')
    lib.export_library(lib_fname)

    if args.host is None:
        # run on the local OpenCL device with the in-memory library
        ctx = tvm.cl(0)
        rlib = lib
        rparams = params
    else:
        # upload model to remote device
        remote = rpc.connect(args.host, args.port)
        remote.upload(lib_fname)
        ctx = remote.cl(0)
        rlib = remote.load_module('net.tar')
        rparams = {name: tvm.nd.array(value, ctx)
                   for name, value in params.items()}

    # create graph runtime
    module = runtime.create(graph, rlib, ctx)
    random_input = np.random.uniform(size=(data_shape)).astype(dtype)
    module.set_input('data', tvm.nd.array(random_input))
    module.set_input(**rparams)

    # the number of runs for warm up and test;
    # mobilenet is fast, so it needs more runs for a stable measurement
    scale = 5 if model == 'mobilenet' else 1
    num_warmup = 10 * scale
    num_test = 60 * scale

    # perform some warm up runs
    module.module.time_evaluator("run", ctx, num_warmup)()

    # test
    prof_res = module.module.time_evaluator("run", ctx, num_test)()
    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          (model, dtype, prof_res.mean))