def test_rpc_remote_module():
    """Build an add-one kernel locally, ship it over local RPC, and time it remotely."""
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)

    # Element-wise add-one graph over a fixed-size 1-D tensor.
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote():
        # LLVM is required to produce the shared library for the CPU target.
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
def test_rpc_simple():
    """Register plain Python callbacks and invoke them through an RPC session."""
    if not tvm.module.enabled("rpc"):
        return

    @tvm.register_func("rpc.test.addone")
    def addone(x):
        return x + 1

    @tvm.register_func("rpc.test.strcat")
    def strcat(name, x):
        return "%s:%d" % (name, x)

    @tvm.register_func("rpc.test.except")
    def remotethrow(name):
        raise ValueError("%s" % name)

    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port, key="x1")

    # Simple scalar round trip.
    f1 = client.get_function("rpc.test.addone")
    assert f1(10) == 11

    # A remote exception must surface as TVMError carrying the message.
    f3 = client.get_function("rpc.test.except")
    try:
        f3("abc")
        assert False
    except tvm.TVMError as e:
        assert "abc" in str(e)

    # Mixed string/int arguments.
    f2 = client.get_function("rpc.test.strcat")
    assert f2("abc", 11) == "abc:11"
def test_rpc_array():
    """Compile an add-one kernel for emscripten and run it via the RPC proxy."""
    if not tvm.module.enabled("rpc"):
        return
    # Element-wise add-one graph.
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    target = "llvm -target=asmjs-unknown-emscripten -system-lib"

    def check_remote():
        if not tvm.module.enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], target, name="myadd")
        path_obj = temp.relpath("dev_lib.bc")
        path_dso = temp.relpath("dev_lib.js")
        f.save(path_obj)
        emscripten.create_js(path_dso, path_obj, side_module=True)
        # Upload with a .dso suffix so the remote side can dlopen it.
        remote.upload(path_dso, "dev_lib.dso")
        data = remote.download("dev_lib.dso")
        f1 = remote.load_module("dev_lib.dso")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
def test_rpc_array():
    """Compile an add-one kernel for emscripten and exercise it over the RPC proxy."""
    if not tvm.module.enabled("rpc"):
        return
    # Element-wise add-one graph.
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    target = "llvm -target=asmjs-unknown-emscripten -system-lib"

    def check_remote():
        if not tvm.module.enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], target, name="myadd")
        path_obj = temp.relpath("dev_lib.bc")
        path_dso = temp.relpath("dev_lib.js")
        f.save(path_obj)
        emscripten.create_js(path_dso, path_obj, side_module=True)
        # Rename to a .dso suffix on upload so it can be loaded remotely.
        remote.upload(path_dso, "dev_lib.dso")
        data = remote.download("dev_lib.dso")
        f1 = remote.load_module("dev_lib.dso")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
def build_arm():
    """Cross-compile for ARMv7, verify the ELF, and optionally run on a remote board.

    Uses module-level `s`, `A`, `B`, `C`, `nn` defined elsewhere in this file.
    """
    target = "llvm -target=armv7-none-linux-gnueabihf"
    if not tvm.module.enabled(target):
        print("Skip because %s is not enabled.." % target)
        return
    temp = util.tempdir()
    f = tvm.build(s, [A, B, C], target)
    path = temp.relpath("myadd.o")
    f.save(path)
    # 0x28 is the expected e_machine value (EM_ARM) in the ELF header.
    verify_elf(path, 0x28)
    asm_path = temp.relpath("myadd.asm")
    f.save(asm_path)

    # RPC verification: launch the kernel on an ARM board when one is configured.
    host = os.environ.get('TVM_RPC_ARM_HOST', None)
    remote = None
    if host:
        port = int(os.environ['TVM_RPC_ARM_PORT'])
        try:
            remote = rpc.connect(host, port)
        except tvm.TVMError:
            # Board unreachable: fall through and skip the remote check.
            pass
    if remote:
        remote.upload(path)
        farm = remote.load_module("myadd.o")
        ctx = remote.cpu(0)
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        farm(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
        print("Verification finish on remote..")
def test_bigendian_rpc():
    """Test big endian rpc when there is a PowerPC RPC server available.

    Skips silently unless TVM_POWERPC_TEST_HOST is set in the environment.
    """
    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
    # BUG FIX: os.environ.get returns a *string* when the variable is set,
    # so the value must be converted; otherwise rpc.connect receives a str port.
    port = int(os.environ.get("TVM_POWERPC_TEST_PORT", 9090))
    if host is None:
        return

    def verify_rpc(remote, target, shape, dtype):
        # Build a scalar add-one kernel for the given dtype and run it remotely.
        A = tvm.placeholder(shape, dtype=dtype)
        B = tvm.compute(A.shape, lambda i: A[i] + tvm.const(1, A.dtype))
        s = tvm.create_schedule(B.op)
        f = tvm.build(s, [A, B], target, name="myadd")

        ctx = remote.cpu(0)
        a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx)
        b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx)
        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        f.save(path_dso)
        remote.upload(path_dso)
        f = remote.load_module("dev_lib.o")
        f(a, b)
        np.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())

    print("Test RPC connection to PowerPC...")
    remote = rpc.connect(host, port)
    target = "llvm -mtriple=powerpc-linux-gnu"
    for dtype in ["float32", "float64", "int32", "int8"]:
        verify_rpc(remote, target, (10, ), dtype)
def main():
    """Benchmark resnet/mobilenet inference on a Raspberry Pi over RPC."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host', type=str, required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()

    opt_level = args.opt_level
    num_iter = args.num_iter

    # Standard ImageNet-style input/output shapes.
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)
    data_shape = (batch_size,) + image_shape
    out_shape = (batch_size, num_classes)

    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.rasp(), shape={"data": data_shape}, params=params)

    # Save the compiled object and ship it to the board.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)

    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)

    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    module = runtime.create(graph, rlib, ctx)
    module.set_input('data',
                     tvm.nd.array(np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)

    # One warm-up run and fetch the output to force completion.
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for i in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat
        time.sleep(45)
def test_rpc_file_exchange():
    """Upload a byte blob over RPC and verify the download round-trips."""
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    blob = bytearray(np.random.randint(0, 10, size=(10)))
    remote.upload(blob, "dat.bin")
    rev = remote.download("dat.bin")
    assert(rev == blob)
def test_rpc_return_func():
    """A remote function may return a closure; calling it must work through RPC."""
    @tvm.register_func("rpc.test.remote_func")
    def addone(x):
        # Returns a function value, not a plain result.
        return lambda y: x + y

    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port, key="x1")
    f1 = client.get_function("rpc.test.remote_func")
    fadd = f1(10)
    assert fadd(12) == 22
def try_remote_save_load():
    """Build an OpenGL add kernel, split host/device parts, and run them via the JS proxy."""
    if not tvm.module.enabled("rpc"):
        return
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    # Build the module: C = A + B scheduled for OpenGL.
    n = tvm.var("n")
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule(C.op)
    s[C].opengl()
    target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
    f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")

    remote = rpc.connect(proxy_host, proxy_port, key="js")

    temp = util.tempdir()
    ctx = remote.opengl(0)
    path_obj = temp.relpath("myadd.bc")
    path_dso = temp.relpath("myadd.js")
    path_gl = temp.relpath("myadd.gl")
    path_json = temp.relpath("myadd.tvm_meta.json")

    # Host side goes through emscripten; device side is the GL shader module.
    f.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    f.imported_modules[0].save(path_gl)

    remote.upload(path_dso, "myadd.dso")
    remote.upload(path_gl)
    remote.upload(path_json)
    remote.download("myadd.dso")
    remote.download("myadd.gl")
    remote.download("myadd.tvm_meta.json")

    print('Loading myadd.dso')
    fhost = remote.load_module("myadd.dso")
    print('Loading myadd.gl')
    fdev = remote.load_module("myadd.gl")
    print('import_module')
    fhost.import_module(fdev)

    print('running...')
    a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
    c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
    fhost(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_rpc_file_exchange():
    """Round-trip a random byte blob through RPC upload/download."""
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    blob = bytearray(np.random.randint(0, 10, size=(10)))
    remote.upload(blob, "dat.bin")
    rev = remote.download("dat.bin")
    assert (rev == blob)
def test_rpc_module():
    """Build metal + cpu variants of an add-one kernel and time both on an iOS device.

    Relies on module-level configuration: `target`, `arch`, `sdk`, `proxy_host`,
    `proxy_port`, `key`, `destination`.
    """
    # Element-wise add-one graph.
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()

    # GPU (metal) schedule: bind to thread/block axes.
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we don't want to do metal and only use cpu, just set target to be target
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk)
    # iOS requires signed binaries.
    xcode.codesign(path_dso1)

    # CPU schedule: parallel with explicit launch/barrier pragmas.
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)

    # Start RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  options=['-quiet'],
                                  libs=[path_dso1, path_dso2])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)

    # Metal run.
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    # CPU run.
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    # BUG FIX: was f1.entry_name — the CPU evaluator must use f2's entry point.
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def try_remote_save_load():
    """Split an OpenGL add kernel into host/device modules and run them remotely."""
    if not tvm.module.enabled("rpc"):
        return
    if not tvm.module.enabled("opengl"):
        return
    if not tvm.module.enabled("llvm"):
        return

    # Build the module.
    n = tvm.var("n")
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule(C.op)
    s[C].opengl()
    target_host = "llvm -target=asmjs-unknown-emscripten -system-lib"
    f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd")

    remote = rpc.connect(proxy_host, proxy_port, key="js")

    temp = util.tempdir()
    ctx = remote.opengl(0)
    path_obj = temp.relpath("myadd.bc")
    path_dso = temp.relpath("myadd.js")
    path_gl = temp.relpath("myadd.gl")
    path_json = temp.relpath("myadd.tvm_meta.json")

    f.save(path_obj)
    emscripten.create_js(path_dso, path_obj, side_module=True)
    f.imported_modules[0].save(path_gl)

    remote.upload(path_dso, "myadd.dso")
    remote.upload(path_gl)
    remote.upload(path_json)
    remote.download("myadd.dso")
    remote.download("myadd.gl")
    remote.download("myadd.tvm_meta.json")

    print('Loading myadd.dso')
    fhost = remote.load_module("myadd.dso")
    print('Loading myadd.gl')
    fdev = remote.load_module("myadd.gl")
    print('import_module')
    fhost.import_module(fdev)

    print('running...')
    a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
    c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
    fhost(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_rpc_module():
    """Build opencl + cpu variants of an add-one kernel and time both on Android.

    Relies on module-level configuration: `target`, `proxy_host`, `proxy_port`, `key`.
    """
    # Element-wise add-one graph.
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()

    # GPU schedule: thread/block binding.
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we don't want to do metal and only use cpu, just set target to be target
    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.so")
    f.export_library(path_dso1, ndk.create_shared)

    # CPU schedule: parallel with explicit launch/barrier pragmas.
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.so")
    f.export_library(path_dso2, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)

    print('Run GPU test ...')
    ctx = remote.cl(0)
    remote.upload(path_dso1)
    f1 = remote.load_module("dev_lib.so")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    print('Run CPU test ...')
    ctx = remote.cpu(0)
    remote.upload(path_dso2)
    f2 = remote.load_module("cpu_lib.so")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def test_rpc_array():
    """Push an NDArray to a remote context and read it back through a callback."""
    if not tvm.module.enabled("rpc"):
        return
    x = np.random.randint(0, 10, size=(3, 4))

    @tvm.register_func("rpc.test.remote_array_func")
    def remote_array_func(y):
        # The server-side view of the array must match the original data.
        np.testing.assert_equal(y.asnumpy(), x)

    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    r_cpu = tvm.nd.array(x, remote.cpu(0))
    assert str(r_cpu.context).startswith("remote")
    np.testing.assert_equal(r_cpu.asnumpy(), x)
    fremote = remote.get_function("rpc.test.remote_array_func")
    fremote(r_cpu)
def test_rpc_array():
    """Copy an array to a remote context and validate it via a registered callback."""
    if not tvm.module.enabled("rpc"):
        return
    x = np.random.randint(0, 10, size=(3, 4))

    @tvm.register_func("rpc.test.remote_array_func")
    def remote_array_func(y):
        np.testing.assert_equal(y.asnumpy(), x)

    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    print("second connect")
    r_cpu = tvm.nd.array(x, remote.cpu(0))
    assert str(r_cpu.context).startswith("remote")
    np.testing.assert_equal(r_cpu.asnumpy(), x)
    fremote = remote.get_function("rpc.test.remote_array_func")
    fremote(r_cpu)
def mobileNet_rpc_module(dtype):
    """Compile mobilenet for a Mali GPU, deploy over RPC, and report mean run time.

    Relies on module-level config: `image_shape`, `data_shape`, `target`,
    `proxy_host`, `proxy_port`, `key`.
    """
    # load model
    net, params = nnvm.testing.mobilenet.get_workload(batch_size=1,
                                                      image_shape=image_shape,
                                                      dtype=dtype)

    # compile — fp16 uses a lower opt level.
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net, tvm.target.mali(),
                                                 shape={"data": data_shape},
                                                 params=params, dtype=dtype,
                                                 target_host=target)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.so')
    lib.export_library(lib_fname, ndk.create_shared)
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    remote.upload(lib_fname)

    ctx = remote.cl(0)
    rlib = remote.load_module('net.so')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    print('Run GPU test ...')
    # create graph runtime
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)

    # the num of runs for warm up and test
    num_warmup = 50
    num_test = 300
    warm_up_timer = module.module.time_evaluator("run", ctx, num_warmup)
    warm_up_timer()
    ftimer = module.module.time_evaluator("run", ctx, num_test)
    prof_res = ftimer()
    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          ("mobileNet", dtype, prof_res.mean))
def check_remote():
    """Run the graph runtime over RPC; closes over `s`, `A`, `B`, `graph`, `n`."""
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    temp = util.tempdir()
    ctx = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    # Rebind to the remotely-loaded module before creating the runtime.
    mlib = remote.load_module("dev_lib.so")
    mod = graph_runtime.create(graph, mlib, remote.cpu(0))
    a = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, ctx))
    out = tvm.nd.empty((n,), ctx=ctx)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.asnumpy(), a + 1)
def check_remote():
    """Run the graph runtime remotely; closes over `s`, `A`, `B`, `graph`, `n`."""
    if not tvm.module.enabled("llvm"):
        print("Skip because llvm is not enabled")
        return
    mlib = tvm.build(s, [A, B], "llvm", name="myadd")
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    temp = util.tempdir()
    ctx = remote.cpu(0)
    path_dso = temp.relpath("dev_lib.so")
    mlib.export_library(path_dso)
    remote.upload(path_dso)
    mlib = remote.load_module("dev_lib.so")
    mod = graph_runtime.create(graph, mlib, remote.cpu(0))
    a = np.random.uniform(size=(n, )).astype(A.dtype)
    mod.run(x=tvm.nd.array(a, ctx))
    out = tvm.nd.empty((n, ), ctx=ctx)
    out = mod.get_output(0, out)
    np.testing.assert_equal(out.asnumpy(), a + 1)
def check():
    """Start a websocket proxy server in a subprocess and test RPC callbacks through it."""
    if not tvm.module.enabled("rpc"):
        return

    @tvm.register_func("rpc.test2.addone")
    def addone(x):
        return x + 1

    @tvm.register_func("rpc.test2.strcat")
    def addone(name, x):
        return "%s:%d" % (name, x)

    server = multiprocessing.Process(
        target=rpc_proxy.websocket_proxy_server,
        args=("ws://localhost:%d/ws" % web_port, "x1"))
    # BUG FIX: the attribute is `daemon` (was misspelled `deamon`, a no-op),
    # and it must be set BEFORE start() or it raises/has no effect.
    server.daemon = True
    server.start()
    # Need to make sure that the connection start after proxy comes up
    time.sleep(0.1)
    # NOTE(review): `prox` is not defined in this scope — presumably a proxy
    # handle holding host/port; confirm against the module that defines it.
    client = rpc.connect(prox.host, prox.port, key="x1")
    f1 = client.get_function("rpc.test2.addone")
    assert f1(10) == 11
    f2 = client.get_function("rpc.test2.strcat")
    assert f2("abc", 11) == "abc:11"
def test_rpc_executor():
    """Compile exp(x + y), deploy it through a popen RPC server, and check the result."""
    host = "localhost"
    port = 9100
    server = rpc.Server(host, port, use_popen=True)

    # Symbolic graph: z = exp(y + x).
    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.exp(y + x)

    shape = (10, 128)
    dtype = tvm.float32
    shape_dict = {"x": shape, "y": shape}
    tmp = util.tempdir()
    lib_name = tmp.relpath("net.o")

    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    # save module
    lib.save(lib_name)
    remote = rpc.connect(host, port)
    remote.upload(lib_name)
    ctx = remote.cpu(0)
    # load remote
    rlib = remote.load_module("net.o")

    # Create remotemodule
    m = graph_runtime.create(graph, rlib, remote.cpu(0))
    # get member functions
    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
    na = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    nb = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    # set inputs
    set_input("x", na)
    set_input("y", nb)
    # execute
    run()
    # get outputs
    out = tvm.nd.empty(shape, dtype, ctx)
    get_output(0, out)
    np.testing.assert_allclose(
        out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
    server.terminate()
def test_rpc_executor():
    """Run the graph executor remotely for z = exp(x + y) and verify numerically."""
    host = "localhost"
    port = 9091
    server = rpc.Server(host, port)

    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.exp(y + x)

    shape = (10, 128)
    dtype = tvm.float32
    shape_dict = {"x": shape, "y": shape}
    tmp = util.tempdir()
    lib_name = tmp.relpath("net.o")

    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    # save module
    lib.save(lib_name)
    remote = rpc.connect(host, port)
    remote.upload(lib_name)
    ctx = remote.cpu(0)
    # load remote
    rlib = remote.load_module("net.o")

    # Create remotemodule
    m = graph_runtime.create(graph, rlib, remote.cpu(0))
    # get member functions
    set_input, run, get_output = m["set_input"], m["run"], m["get_output"]
    na = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    nb = tvm.nd.array(np.ones(shape).astype(dtype), ctx)
    # set inputs
    set_input("x", na)
    set_input("y", nb)
    # execute
    run()
    # get outputs
    out = tvm.nd.empty(shape, dtype, ctx)
    get_output(0, out)
    np.testing.assert_allclose(out.asnumpy(),
                               np.exp(na.asnumpy() + nb.asnumpy()))
    server.terminate()
def measure_peak_all(target, target_host, host, port):
    """measure memory bandwidth and peak compute for gpu devices

    Parameters
    ----------
    target: str or :any:`tvm.target.Target`
    target_host: str
    host: str
    port: int
    """
    target = tvm.target.create(target)
    remote = rpc.connect(host, port)

    # Workload sizing knobs for the two micro-benchmarks.
    n_times = 20
    bandwidth_total_item = 1 << 25
    bandwidth_item_per_thread = 32
    compute_total_item = 1 << 21
    compute_item_per_thread = 4096

    # Pick the remote device context matching the target backend.
    if str(target).startswith("opencl"):
        ctx = remote.cl()
    elif str(target).startswith("cuda"):
        ctx = remote.gpu()
    elif str(target).startswith("metal"):
        ctx = remote.metal()
    else:
        raise RuntimeError("Unsupported target")

    logging.info("========== measure memory bandwidth ==========")
    measure_bandwidth_all_types(bandwidth_total_item, bandwidth_item_per_thread,
                                n_times, target, target_host, remote, ctx)

    logging.info("========== measure peak compute ==========")
    measure_compute_all_types(compute_total_item, compute_item_per_thread,
                              n_times, target, target_host, remote, ctx)
def check():
    """Launch the websocket proxy in a child process and test RPC callbacks through it."""
    if not tvm.module.enabled("rpc"):
        return

    @tvm.register_func("rpc.test2.addone")
    def addone(x):
        return x + 1

    @tvm.register_func("rpc.test2.strcat")
    def addone(name, x):
        return "%s:%d" % (name, x)

    server = multiprocessing.Process(
        target=proxy.websocket_proxy_server,
        args=("ws://localhost:%d/ws" % web_port, "x1"))
    # BUG FIX: `deamon` was a typo for `daemon`, and the flag must be set
    # before start() — otherwise the child is non-daemonic and can outlive us.
    server.daemon = True
    server.start()
    # Need to make sure that the connection start after proxy comes up
    time.sleep(0.1)
    # NOTE(review): `prox` is undefined here — presumably an object exposing
    # the proxy's host/port; confirm against the defining module.
    client = rpc.connect(prox.host, prox.port, key="x1")
    f1 = client.get_function("rpc.test2.addone")
    assert f1(10) == 11
    f2 = client.get_function("rpc.test2.strcat")
    assert f2("abc", 11) == "abc:11"
def test_bigendian_rpc_param():
    """Test big endian rpc when there is a PowerPC RPC server available.

    Skips silently unless TVM_POWERPC_TEST_HOST is set.
    """
    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
    # BUG FIX: the env var value is a string; cast so rpc.connect gets an int port.
    port = int(os.environ.get("TVM_POWERPC_TEST_PORT", 9090))
    if host is None:
        return

    def verify_nnvm(remote, target, shape, dtype):
        # Build y = x + 1 and run it on the remote with serialized parameters.
        x = nnvm.sym.Variable("x")
        y = x + 1
        graph, lib, _ = nnvm.compiler.build(
            y, target, shape={"x": shape}, dtype={"x": dtype})
        temp = util.tempdir()
        path_dso = temp.relpath("dev_lib.o")
        lib.save(path_dso)
        remote.upload(path_dso)
        lib = remote.load_module("dev_lib.o")
        a = np.random.randint(0, 256, size=shape).astype(dtype)
        a[:] = 1
        params = {"x": a}
        ctx = remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)
        # uses save param_dict — exercises the byte-order-sensitive serializer.
        m.load_params(nnvm.compiler.save_param_dict(params))
        m.run()
        out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
        np.testing.assert_allclose(a + 1, out.asnumpy())

    print("Test RPC connection to PowerPC...")
    remote = rpc.connect(host, port)
    target = "llvm -mtriple=powerpc-linux-gnu"
    for dtype in ["float32", "float64", "int32", "int8"]:
        verify_nnvm(remote, target, (10, ), dtype)
from tvm.contrib import rpc

# Module-level RPC session: connect to the Android device's RPC server at
# import time so every test in this file shares one connection.
REMOTE = rpc.connect("0.0.0.0", 9090, key="android")
# Script-level setup: import the ONNX model and open an RPC session to the
# Android proxy. Uses `img`, `n`, `onnx_graph`, `exec_gpu` defined earlier.
input_name = 'input_0'
data_shape = img.shape
# YOLO-style output: 125 channels over a 32x-downsampled grid.
out_shape = (1, 125, n // 32, n // 32)

# GET model from frameworks
# change xyz to supported framework name.
sym, params = nnvm.frontend.from_onnx(onnx_graph)

# connect to the proxy
# Set to be address of tvm proxy.
proxy_host = os.environ["TVM_ANDROID_RPC_PROXY_HOST"]
proxy_port = 9090
key = "android"

print('RPC Connecting...')
remote = rpc.connect(proxy_host, proxy_port, key=key)
print('RPC Connected')

arch = "arm64"
if exec_gpu:
    # Mobile GPU
    target = 'opencl'
    target_host = "llvm -target=%s-linux-android" % arch
    ctx = remote.cl(0)
else:
    # Mobile CPU
    target = "llvm -target=%s-linux-android" % arch
    target_host = None
    ctx = remote.cpu(0)
print('Build Graph...')
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    """Build a tiled/packed GEMM for OpenCL, deploy it over RPC, and benchmark it.

    Parameters: N is the square matrix size; bn the packing width; num_block /
    num_thread the block/thread tiling factors. Uses module-level `target`,
    `proxy_host`, `proxy_port`, `key`, and the `evaluate` helper.
    """
    assert (bn <= N)
    assert (num_thread * num_thread * 16 <= N)
    assert (num_block * num_block * 2 <= N)

    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')

    # BUG FIX: shape dimensions must be integers — under Python 3, `N / bn`
    # yields a float; use floor division for the packed layout shape.
    packedB = tvm.compute((N, N // bn, bn),
                          lambda x, y, z: B[x, y * bn + z], name='B')
    C = tvm.compute(
        (N, N),
        lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k),
        name='C')

    s = tvm.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")
    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    # Schedule the packing stage across threads, vectorizing the inner lane.
    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    # Tile C over blocks, vthreads, and threads.
    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)
    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    # Accumulate in local cache; unroll/vectorize the innermost reduction.
    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)
    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
# library and the new parameter, since we do some optimization that will
# change the parameters but keep the result of model as the same.

# Save the library at local temporary directory.
tmp = util.tempdir()
lib_fname = tmp.relpath('net.tar')
lib.export_library(lib_fname)

######################################################################
# Deploy the Model Remotely by RPC
# --------------------------------
# With RPC, you can deploy the model remotely from your host machine
# to the remote device.

# connect the server
remote = rpc.connect(host, port)

# upload the library to remote device and load it
remote.upload(lib_fname)
rlib = remote.load_module('net.tar')

ctx = remote.cl(0)
# upload the parameter
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

# create the remote runtime module
module = runtime.create(graph, rlib, ctx)
# set parameter
module.set_input(**rparams)
# set input data
module.set_input('data', tvm.nd.array(x.astype('float32')))
# Build the graph, save host/device modules separately, and load them remotely.
graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
                                         params=params, target_host=target_host)

# Save the library at local temporary directory.
tmp = util.tempdir()
path_o = tmp.relpath('sym.o')
path_cl = tmp.relpath('sym.cl')
path_json = tmp.relpath('sym.tvm_meta.json')
lib.save(path_o)
# The first imported module holds the OpenCL device code.
lib.imported_modules[0].save(path_cl)

# connect the server
remote = rpc.connect('192.168.1.14', 9090)

# upload the library to remote device and load it
remote.upload(path_o)
remote.upload(path_cl)
remote.upload(path_json)
fhost = remote.load_module('sym.o')
fdev = remote.load_module('sym.cl')
# Link the device module into the host module.
fhost.import_module(fdev)

from tvm.contrib import graph_runtime
ctx = remote.cl(0)
# upload the parameter
rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
dtype = 'float32'
def run_case(model, dtype):
    """Benchmark one (model, dtype) pair on a Mali GPU, locally or over RPC.

    Uses module-level `image_shape`, `data_shape`, and the parsed `args`.
    """
    # load model
    if model == 'vgg16':
        net, params = nnvm.testing.vgg.get_workload(
            num_layers=16, batch_size=1, image_shape=image_shape, dtype=dtype)
    elif model == 'resnet18':
        net, params = nnvm.testing.resnet.get_workload(
            num_layers=18, batch_size=1, image_shape=image_shape, dtype=dtype)
    elif model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape, dtype=dtype)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(model))

    # compile — fp16 uses a lower opt level.
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.mali(), shape={"data": data_shape},
            params=params, dtype=dtype, target_host=args.target_host)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if args.host is not None:
        remote = rpc.connect(args.host, args.port)
        remote.upload(lib_fname)
        ctx = remote.cl(0)
        rlib = remote.load_module('net.tar')
        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    else:
        # No host configured: run on the local OpenCL device.
        ctx = tvm.cl(0)
        rlib = lib
        rparams = params

    # create graph runtime
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data', tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)

    # the num of runs for warm up and test
    num_warmup = 10
    num_test = 60
    if model == 'mobilenet':
        # mobilenet is fast, need more runs for stable measureament
        num_warmup *= 5
        num_test *= 5

    # perform some warm up runs
    warm_up_timer = module.module.time_evaluator("run", ctx, num_warmup)
    warm_up_timer()

    # test
    ftimer = module.module.time_evaluator("run", ctx, num_test)
    prof_res = ftimer()
    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          (model, dtype, prof_res.mean))
def main():
    """Benchmark a resnet/mobilenet NNVM workload on a remote Raspberry Pi.

    Parses command-line arguments, compiles the chosen workload for an ARM
    cross-compilation target, ships the object file to the Pi over RPC,
    verifies one end-to-end run, then prints three timing rounds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=True,
                        choices=['resnet', 'mobilenet'],
                        help="The model type.")
    parser.add_argument('--host', type=str, required=True,
                        help="The host address of your Raspberry Pi.")
    parser.add_argument('--port', type=int, required=True,
                        help="The port number of your Raspberry Pi.")
    parser.add_argument('--opt-level', type=int, default=1,
                        help="Level of optimization.")
    parser.add_argument('--num-iter', type=int, default=50,
                        help="Number of iteration during benchmark.")
    args = parser.parse_args()
    opt_level = args.opt_level

    # Cross-compilation target for the Pi's ARM CPU.
    # BUGFIX: the ABI component was misspelled "anueabihf"; the correct
    # hard-float ABI suffix in an LLVM triple is "gnueabihf".
    target = "llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon"
    num_iter = args.num_iter
    batch_size = 1
    num_classes = 1000
    image_shape = (3, 224, 224)  # CHW input for a single image
    data_shape = (batch_size, ) + image_shape
    out_shape = (batch_size, num_classes)

    # Load the requested workload from nnvm's testing model zoo.
    if args.model == 'resnet':
        net, params = nnvm.testing.resnet.get_workload(
            batch_size=1, image_shape=image_shape)
    elif args.model == 'mobilenet':
        net, params = nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape)
    else:
        raise ValueError('no benchmark prepared for {}.'.format(args.model))

    # Compile under the Raspberry Pi target context.
    with nnvm.compiler.build_config(opt_level=opt_level):
        with tvm.target.rasp():
            graph, lib, params = nnvm.compiler.build(
                net, target, shape={"data": data_shape}, params=params)

    # Save the host object file and deploy it through RPC; the remote
    # server turns the uploaded .o into a loadable module.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.o')
    lib.save(lib_fname)
    remote = rpc.connect(args.host, args.port)
    remote.upload(lib_fname)
    ctx = remote.cpu(0)
    rlib = remote.load_module('net.o')
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}

    # Build the graph runtime, feed random input, and run once so any
    # deployment problem surfaces before timing starts.
    module = runtime.create(graph, rlib, ctx)
    module.set_input('data', tvm.nd.array(
        np.random.uniform(size=(data_shape)).astype("float32")))
    module.set_input(**rparams)
    module.run()
    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
    out.asnumpy()  # pull the result back to the host to complete the run

    print('benchmark args: {}'.format(args))
    ftimer = module.module.time_evaluator("run", ctx, num_iter)
    for _ in range(3):
        prof_res = ftimer()
        print(prof_res)
        # sleep for avoiding cpu overheat
        time.sleep(45)
def deploy_rpc():
    """Runs the demo that deploys a model remotely through RPC. """
    from tvm.contrib import rpc, util, emscripten

    # Load the resnet18 model as usual.
    net, params, data_shape, out_shape = load_mxnet_resnet()

    # Compile with a different target this time: the host library is
    # translated into JavaScript through Emscripten, device code is OpenGL.
    graph, lib, params = compile_net(
        net,
        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
        target="opengl",
        data_shape=data_shape,
        params=params)

    # Persist the module pieces locally before shipping them out.
    print("Saving the compiled module...")
    tmp = util.tempdir()
    bc_path = tmp.relpath("deploy.bc")              # host LLVM part
    js_path = tmp.relpath("deploy.js")              # host JavaScript part
    gl_path = tmp.relpath("deploy.gl")              # device GLSL part
    meta_path = tmp.relpath("deploy.tvm_meta.json")
    lib.save(bc_path)
    emscripten.create_js(js_path, bc_path, side_module=True)
    lib.imported_modules[0].save(gl_path)
    print("- Saved files:", tmp.listdir())

    # Connect to the RPC server.
    print("Connecting to RPC server...")
    proxy_host = 'localhost'
    proxy_port = 9090
    remote = rpc.connect(proxy_host, proxy_port, key="js")
    print("- Connected to RPC server!")

    # Upload the module files; the JS part is renamed to .dso so the
    # remote side can load it.
    print("Uploading module to RPC server...")
    remote.upload(js_path, "deploy.dso")
    remote.upload(gl_path)
    remote.upload(meta_path)
    print("- Upload completed!")

    # Load host and device parts remotely and link them together.
    print("Loading remote library...")
    dev_mod = remote.load_module("deploy.gl")
    host_mod = remote.load_module("deploy.dso")
    host_mod.import_module(dev_mod)
    rlib = host_mod
    print("- Remote library loaded!")

    ctx = remote.opengl(0)

    # Copy the parameters into remote device memory.
    print("Uploading parameters...")
    rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
    print("- Parameters uploaded!")

    # Create the remote runtime module and run it once.
    print("Running remote module...")
    from tvm.contrib import graph_runtime
    module = graph_runtime.create(graph, rlib, ctx)
    module.set_input(**rparams)
    input_data = np.random.uniform(size=data_shape)
    module.set_input('data', tvm.nd.array(input_data.astype('float32')))
    module.run()
    print("- Remote module execution completed!")

    # Fetch the output and show its first 10 elements.
    out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx))
    print(out.asnumpy()[0][0:10])
##--------------------------------- ## Under debugging ##--------------------------------- import cv2 import time inshape = (1,3,224,224) outshape = (1,1000) basename="models/alexnet/deploy_rasp" loaded_params = bytearray(open(basename+".params", "rb").read()) # connect the server remote = rpc.connect("raspberrypi.local", 9090) # upload the library to remote device and load it lib_fname='models/alexnet/deploy_rasp.o' print("uploading") remote.upload(lib_fname) print("loading module") rlib = remote.load_module("deploy_rasp.o") print("loading graph") graph = open(basename+".json").read() ctx = remote.cpu(0) # upload the parameter print("loading paramdict") params = nnvm.compiler.load_param_dict(loaded_params) print("converting paramdict")
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    """Build and run an NxN OpenCL GEMM kernel on a remote device via RPC.

    Computes C = A * B with B pre-packed into (N, N/bn, bn) blocks, then
    schedules the computation across GPU blocks, threads, and virtual
    threads before shipping the compiled library through the RPC proxy.

    Relies on module-level globals defined elsewhere in the file:
    ``target``, ``proxy_host``, ``proxy_port``, ``key``, ``ndk``,
    and the ``evaluate`` helper.

    Parameters
    ----------
    N : int
        Matrix dimension (square matrices).
    times : int
        Passed through to ``evaluate`` (number of timing runs).
    bn : int
        Packing block width for B; must divide into the schedule cleanly.
    num_block, num_thread : int
        GPU grid/thread split factors used by the schedule.
    """
    # Sanity checks: the split factors below require these size relations.
    assert(bn <= N)
    assert(num_thread * num_thread * 16 <= N)
    assert(num_block * num_block * 2 <= N)
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')
    # Pack B into bn-wide panels to get contiguous, vectorizable access.
    packedB = tvm.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name = 'B')
    C = tvm.compute(
        (N, N),
        lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k),
        name='C')
    s = tvm.create_schedule(C.op)
    # Accumulate C in a local cache before writing back.
    CC = s.cache_write(C, "local")
    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")
    # Two-way virtual threads on each dimension.
    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")
    # Schedule the packing stage over the thread grid; vectorize the
    # innermost 8-wide chunk of each panel.
    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)
    # Outer split of C over blocks/threads.
    # NOTE(review): bx is bound to thread_y and (below) ty to block_x —
    # this axis/thread pairing looks intentional but unusual; confirm
    # against the original schedule before changing.
    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)
    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)
    # Inner splits: 2-way vthreads, then block/thread tiling.
    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)
    # Vectorize the innermost 8 elements of the output write.
    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)
    # Compute the local accumulator inside the yi loop; vectorize its
    # inner writes and unroll the 2-step reduction chunk.
    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)
    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)
    # Dump the lowered IR for inspection, then build for OpenCL with the
    # module-level host target string.
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    # Export with the NDK toolchain so the library runs on the device.
    f.export_library(path_dso, ndk.create_shared)
    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")
    evaluate(f, ctx, N, times)
# It is recommended to set target triple and feature set to contain specific # feature available, so we can take full advantage of the features of the # board. # You can find more details about cross compilation attributes from # `LLVM guide of cross compilation <https://clang.llvm.org/docs/CrossCompilation.html>`_. ###################################################################### # Run Kernel Remotely by RPC # -------------------------- # Here we will show you how to run the kernel on the remote device: # replace host with the ip address of your device host = '0.0.0.0' port = 9090 # connect the remote device remote = rpc.connect(host, port) ###################################################################### # Here we upload the lib to the remote device, then invoke a device local # compiler for shared lib and load it into device memory. now `f` is a # remote module object. remote.upload(path) f = remote.load_module('mylib.o') # create array on the remote device ctx = remote.cpu(0) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) # the function will run on the remote device f(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def test_rpc_remote_module():
    """Exercise uploading, loading, and timing modules via a local RPC server."""
    if not tvm.module.enabled("rpc"):
        return
    # In-process server plus a client connected to it.
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)

    # The computation under test: B = A + 1.0 over a length-1024 vector.
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote():
        # Build for llvm, export a shared library, upload it, and verify
        # the remote run produces a + 1.
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        tmpdir = util.tempdir()
        ctx = remote.cpu(0)
        built = tvm.build(s, [A, B], "llvm", name="myadd")
        dso_path = tmpdir.relpath("dev_lib.so")
        built.export_library(dso_path)
        remote.upload(dso_path)
        rmod = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        # Average of 10 timed runs of the remote entry function.
        timer = rmod.time_evaluator(rmod.entry_name, remote.cpu(0), number=10)
        cost = timer(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl():
        """Run remote OpenCL code with linking performed on the remote side.

        Not enabled: there is a forking issue in the TVM runtime when the
        server launches after the OpenCL runtime initializes. Kept as an
        example of doing RPC with linking on the remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        tmpdir = util.tempdir()
        ctx = remote.cl(0)
        # Fresh schedule bound to GPU block/thread axes for the opencl build.
        sch = tvm.create_schedule(B.op)
        xo, xi = sch[B].split(B.op.axis[0], factor=32)
        sch[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        sch[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        built = tvm.build(sch, [A, B], "opencl", target_host="llvm", name="myadd")

        def fresh_args():
            # New input/output arrays for each option.
            return (tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx),
                    tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx))

        # Option 1: save modules separately and rely on remote compiler
        o_path = tmpdir.relpath("myadd.o")
        cl_path = tmpdir.relpath("myadd.cl")
        meta_path = tmpdir.relpath("myadd.tvm_meta.json")
        built.save(o_path)
        built.imported_modules[0].save(cl_path)
        remote.upload(o_path)
        remote.upload(cl_path)
        # upload meta data
        remote.upload(meta_path)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a, b = fresh_args()
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

        # Option 2: export library as a tar ball then handled by remote compiler
        tar_path = tmpdir.relpath("myadd.tar")
        built.export_library(tar_path)
        remote.upload(tar_path)
        fhost = remote.load_module("myadd.tar")
        a, b = fresh_args()
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()
def test_rpc_remote_module():
    """Test uploading and running compiled modules through a local RPC server."""
    # NOTE(review): this duplicates an earlier definition of
    # test_rpc_remote_module in this file; at import time the later def
    # shadows the earlier one -- confirm which copy is intended to survive.
    if not tvm.module.enabled("rpc"):
        return
    # Start an in-process RPC server and connect a client to it.
    server = rpc.Server("localhost")
    remote = rpc.connect(server.host, server.port)
    # graph: element-wise B = A + 1.0 over a length-1024 vector
    n = tvm.convert(1024)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote():
        # Build for llvm, export as a shared library, upload, and verify
        # the remote run produces a + 1.
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        # Average of 10 timed runs of the remote entry function.
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl():
        """Test function to run remote code such as cl

        This is not enabled because there is forking issue of TVM runtime
        when server launches after OpenCL runtime initializes. We leave it
        as an example on how to do rpc when we want to do linking on remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        # Fresh schedule bound to GPU block/thread axes for the opencl build.
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote()