예제 #1
0
def init_embedding_processor():
    """Load the embedding model and return it.

    Two sources are tried in order:

    1. A compiled TVM graph under ``DATA_RUNTIME_FOLDER`` (``net2`` graph
       JSON, ``net2.params`` and either ``net2.tar.so`` or ``net2.tar``),
       created on OpenCL device 0.
    2. The MXNet checkpoint ``/root/model-r50-am-lfw/model-0000.params``,
       loaded on the CPU through ``get_model``.

    Side effects:
        Stores the loaded model in the module global ``mod2`` (TVM) or
        ``mod3`` (MXNet), and binds the lazily imported modules to the
        globals ``__t``, ``graph_runtime`` and ``mx``.

    Returns:
        The loaded model, or ``None`` when neither model file exists.
    """
    global mod2, mod3, __t, graph_runtime, mx

    if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2'):
        # Imported lazily so TVM is only required when a TVM model exists.
        import tvm as __t
        from tvm.contrib import graph_runtime

        # Use net2.tar.so when it exists, otherwise fall back to net2.tar.
        lib_path = DATA_RUNTIME_FOLDER + '/net2.tar.so'
        if not os.path.isfile(lib_path):
            lib_path = DATA_RUNTIME_FOLDER + '/net2.tar'
        loaded_lib = __t.module.load(lib_path)

        # Context managers close the files even if reading fails.
        with open(DATA_RUNTIME_FOLDER + "/net2") as graph_file:
            loaded_json = graph_file.read()
        with open(DATA_RUNTIME_FOLDER + "/net2.params", "rb") as params_file:
            loaded_params = bytearray(params_file.read())

        ctx = __t.cl(0)

        mod2 = graph_runtime.create(loaded_json, loaded_lib, ctx)
        mod2.load_params(loaded_params)
        return mod2

    if os.path.isfile('/root/model-r50-am-lfw/model-0000.params'):
        import mxnet as mx
        ctx = mx.cpu(0)
        mod3 = get_model(ctx, [112, 112], '/root/model-r50-am-lfw/model,0',
                         'fc1')
        print('no existing model, nothing to do')
        return mod3

    # Neither model is available.
    return None
예제 #2
0
파일: lstm.py 프로젝트: gwli/tvm
 def check_device(target):
     """Build the LSTM kernel for ``target``, run it twice and print the
     wall-clock time of the second run.

     The first launch is not timed; the second is measured with
     ``time.time()`` around the launch plus a device sync.
     """
     steps = n_num_step
     kernel = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c],
                        target)
     if target == "cuda":
         device = tvm.gpu(0)
     else:
         device = tvm.cl(0)

     # Host-side buffers: zeroed scan outputs, normally distributed inputs.
     state_shape = (steps, batch_size, num_hidden)
     h_host = np.zeros(state_shape).astype("float32")
     c_host = np.zeros(state_shape).astype("float32")
     x_host = np.random.normal(
         size=(steps, batch_size, 4, num_hidden)).astype("float32")
     w_host = np.random.normal(
         size=(4, num_hidden, num_hidden)).astype("float32")

     # Copy everything to the device, then arrange in kernel argument order.
     h_dev, c_dev, x_dev, w_dev = [
         tvm.nd.array(host, device)
         for host in (h_host, c_host, x_host, w_host)
     ]
     kernel_args = (x_dev, w_dev, h_dev, c_dev)

     # Untimed first launch.
     kernel(*kernel_args)
     device.sync()

     # Timed second launch.
     started = time.time()
     kernel(*kernel_args)
     device.sync()
     elapsed = time.time() - started
     print("Time cost=%g" % elapsed)
예제 #3
0
 def check_device(device):
     """Build the broadcast kernel named after ``typ`` for ``device`` and
     check its output against the NumPy result.

     Prints a notice and returns early when ``device`` is not enabled.
     Raises ``NotImplementedError`` for an unknown ``typ``.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, [A, B, C], device,
                        name="broadcast_binary" + "_" + typ)

     left = np.random.uniform(size=lhs_shape).astype(A.dtype)
     right = np.random.uniform(size=rhs_shape).astype(A.dtype)
     if typ == "div":
         # Keep the divisor away from zero.
         right = np.abs(right) + 0.001

     # NumPy reference implementation for every supported operation.
     reference_ops = {
         "add": lambda x, y: x + y,
         "sub": lambda x, y: x - y,
         "div": lambda x, y: x / y,
         "mul": lambda x, y: x * y,
         "maximum": np.maximum,
         "minimum": np.minimum,
     }
     if typ not in reference_ops:
         raise NotImplementedError
     expected = reference_ops[typ](left, right)

     left_dev = tvm.nd.array(left, dev_ctx)
     right_dev = tvm.nd.array(right, dev_ctx)
     result_dev = tvm.nd.array(
         np.empty(expected.shape).astype(B.dtype), dev_ctx)
     kernel(left_dev, right_dev, result_dev)
     np.testing.assert_allclose(
         result_dev.asnumpy(), expected, rtol=1E-4, atol=1E-4)
예제 #4
0
 def check_device(device):
     """Build the reduction kernel for ``device`` and compare its output
     with a NumPy reference selected by the enclosing ``type`` variable.

     Prints a notice and returns early when ``device`` is not enabled.
     Raises ``NotImplementedError`` for an unknown reduction ``type``.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, [A, B], device, name="sum")

     # The reference is computed on sqrt(exp(x)) of the raw random input.
     raw = np.random.uniform(size=in_shape).astype(np.float32)
     mapped = np.sqrt(np.exp(raw)).astype(np.float32)

     if type in ("sum", "max", "min"):
         # These three share the ndarray method of the same name.
         expected = getattr(mapped, type)(axis=axis, keepdims=keepdims)
     elif type == "argmax":
         expected = _my_npy_argmax(mapped, axis=axis, keepdims=keepdims)
     elif type == "argmin":
         expected = _my_npy_argmin(mapped, axis=axis, keepdims=keepdims)
     else:
         raise NotImplementedError

     input_dev = tvm.nd.array(raw, ctx=dev_ctx)
     result_dev = tvm.nd.empty(
         shape=expected.shape, ctx=dev_ctx, dtype=out_dtype)
     kernel(input_dev, result_dev)
     np.testing.assert_allclose(
         result_dev.asnumpy(), expected, rtol=1E-3, atol=1E-3)
예제 #5
0
class ModelAgent:
    """Runs the compiled ``shufflenet`` TVM model on OpenCL device 0.

    The graph JSON, compiled library and parameter blob are read from
    ``shufflenet.json``, ``shufflenet.tar`` and ``shufflenet.params`` in
    the current working directory.
    """

    # Device context, created once at class definition and shared by all
    # instances.
    ctx = tvm.cl(0)
    # dtype the input is cast to before it is handed to the runtime.
    dtype = 'float32'

    def __init__(self):
        """Load graph, library and parameters and create the runtime."""
        # Context managers make sure the files are closed again.
        with open('shufflenet.json') as graph_file:
            self.graph = graph_file.read()
        self.lib = tvm.module.load("shufflenet.tar")
        with open("shufflenet.params", "rb") as params_file:
            self.params = bytearray(params_file.read())
        # Create the runtime on the class-level OpenCL context.
        self.mod = graph_runtime.create(self.graph, self.lib, self.ctx)
        self.mod.load_params(self.params)

    def preprocess_image(self, image):
        """Turn an image into a normalized array with a leading batch axis.

        Args:
            image: Object with a ``resize((w, h))`` method whose result
                converts to a height x width x 3 array (e.g. a PIL RGB
                image -- presumably; verify against the caller).

        Returns:
            Array of shape ``(1, 3, 224, 224)``.
        """
        image = image.resize((224, 224))
        # Scale to [0, 1]; the division also promotes the data to float.
        image = np.array(image) / np.array([255, 255, 255])
        # Per-channel mean subtraction and scaling.
        image -= np.array([0.485, 0.456, 0.406])
        image /= np.array([0.229, 0.224, 0.225])
        # Channels-last -> channels-first, then add the batch axis.
        image = image.transpose((2, 0, 1))
        image = image[np.newaxis, :]
        return image

    def execute(self, inputs):
        """Preprocess ``inputs``, run the model and return output 0."""
        inputs = self.preprocess_image(inputs)
        self.mod.set_input("input", tvm.nd.array(inputs.astype(self.dtype)))
        self.mod.run()
        outputs = self.mod.get_output(0)
        return outputs
예제 #6
0
 def check_device(target):
     """Compile ``flstm`` for ``target``, launch it twice and report how
     long the second launch took (wall clock, including the device sync).
     """
     n_steps = n_num_step
     lstm_fn = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
     dev_ctx = tvm.cl(0) if target != "cuda" else tvm.gpu(0)

     def zeros_f32(shape):
         # float32 array of zeros with the given shape.
         return np.zeros(shape).astype("float32")

     def normal_f32(shape):
         # float32 array of standard-normal samples with the given shape.
         return np.random.normal(size=shape).astype("float32")

     hidden_host = zeros_f32((n_steps, batch_size, num_hidden))
     cell_host = zeros_f32((n_steps, batch_size, num_hidden))
     input_host = normal_f32((n_steps, batch_size, 4, num_hidden))
     weight_host = normal_f32((4, num_hidden, num_hidden))

     hidden_dev = tvm.nd.array(hidden_host, dev_ctx)
     cell_dev = tvm.nd.array(cell_host, dev_ctx)
     input_dev = tvm.nd.array(input_host, dev_ctx)
     weight_dev = tvm.nd.array(weight_host, dev_ctx)

     def launch():
         # One kernel launch followed by a device sync.
         lstm_fn(input_dev, weight_dev, hidden_dev, cell_dev)
         dev_ctx.sync()

     # First launch is untimed; only the second one is measured.
     launch()
     t_begin = time.time()
     launch()
     t_elapsed = time.time() - t_begin
     print("Time cost=%g" % t_elapsed)
예제 #7
0
def init_embedding_processor():
    """Load the embedding model and return it.

    Load order:

    1. When ``HAS_OPENCL == 'false'``: import MXNet, reset ``mod2`` and, if
       ``DATA_RUNTIME_FOLDER/model-0000.params`` exists, return the MXNet
       backup model loaded on the CPU.
    2. When ``DATA_RUNTIME_FOLDER/net2`` exists: load the compiled TVM
       graph on OpenCL device 0. If that fails, fall back to the MXNet
       backup model under ``DATA_RUNTIME_FOLDER`` when it exists.
    3. Otherwise, when ``/root/model-r50-am-lfw/model-0000.params`` exists:
       load that MXNet checkpoint on the CPU.

    Side effects:
        Stores the result in the module global ``mod2`` (TVM) or ``mod3``
        (MXNet) and binds the lazily imported modules to the globals
        ``mx``, ``__t`` and ``graph_runtime``.

    Returns:
        The loaded model, or ``None`` when nothing could be loaded.
    """
    # Every global is declared once, before its first use: Python 3 raises
    # a SyntaxError for a ``global`` statement that follows a use of the
    # same name inside the function.
    global mod2, mod3, mx, __t, graph_runtime

    if HAS_OPENCL == 'false':
        import mxnet as mx
        print('need init mxnet')

        mod2 = None
        if os.path.isfile(DATA_RUNTIME_FOLDER + '/model-0000.params'):
            ctx = mx.cpu(0)
            mod3 = get_model(ctx, [112, 112], DATA_RUNTIME_FOLDER + '/model,0',
                             'fc1')
            print('backup model loaded')
            return mod3
    else:
        print('has opencl supporting')

    if os.path.isfile(DATA_RUNTIME_FOLDER + '/net2'):
        try:
            import tvm as __t
            from tvm.contrib import graph_runtime

            # Use net2.tar.so when it exists, otherwise net2.tar.
            lib_path = DATA_RUNTIME_FOLDER + '/net2.tar.so'
            if not os.path.isfile(lib_path):
                lib_path = DATA_RUNTIME_FOLDER + '/net2.tar'
            loaded_lib = __t.module.load(lib_path)

            # Context managers close the files even when reading fails.
            with open(DATA_RUNTIME_FOLDER + "/net2") as graph_file:
                loaded_json = graph_file.read()
            with open(DATA_RUNTIME_FOLDER + "/net2.params",
                      "rb") as params_file:
                loaded_params = bytearray(params_file.read())

            ctx = __t.cl(0)

            mod2 = graph_runtime.create(loaded_json, loaded_lib, ctx)
            mod2.load_params(loaded_params)
            return mod2
        except Exception:
            # Best-effort fallback: any load failure switches to the MXNet
            # backup. KeyboardInterrupt/SystemExit are no longer swallowed.
            print('error of loading net2')
            mod2 = None
            if os.path.isfile(DATA_RUNTIME_FOLDER + '/model-0000.params'):
                import mxnet as mx
                ctx = mx.cpu(0)
                mod3 = get_model(ctx, [112, 112],
                                 DATA_RUNTIME_FOLDER + '/model,0', 'fc1')
                print('backup model loaded')
                return mod3
    elif os.path.isfile('/root/model-r50-am-lfw/model-0000.params'):
        import mxnet as mx
        ctx = mx.cpu(0)
        mod3 = get_model(ctx, [112, 112], '/root/model-r50-am-lfw/model,0',
                         'fc1')
        print('backup model loaded')
        return mod3

    # Nothing could be loaded.
    return None
예제 #8
0
 def check_device(device):
     """Run the kernel built from ``s`` on ``device`` and compare its
     output with the precomputed reference ``b_np``.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     if device == "cuda":
         dev_ctx = tvm.gpu(0)
     else:
         dev_ctx = tvm.cl(0)

     input_dev = tvm.nd.array(a_np, dev_ctx)
     # Output buffer starts zeroed, sized from B's static shape.
     out_shape = get_const_tuple(B.shape)
     output_dev = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev_ctx)

     kernel = tvm.build(s, [A, B], device)
     kernel(input_dev, output_dev)
     np.testing.assert_allclose(output_dev.asnumpy(), b_np, rtol=1e-5)
예제 #9
0
 def check_device(device):
     """Build the ``expand_dims`` kernel for ``device`` and check it
     against a NumPy reshape of the same random input.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, [A, B], device, name="expand_dims")

     source = np.random.uniform(size=in_shape).astype(A.dtype)
     # Reference result: the same data viewed with the output shape.
     expected = source.reshape(out_shape)

     source_dev = tvm.nd.array(source, dev_ctx)
     result_host = np.empty(out_shape).astype(B.dtype)
     result_dev = tvm.nd.array(result_host, dev_ctx)

     kernel(source_dev, result_dev)
     np.testing.assert_allclose(result_dev.asnumpy(), expected)
예제 #10
0
 def check_device(device):
     """Build the ``reshape`` kernel for ``device`` and check it against
     NumPy's reshape of the same random input.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     if device == "cuda":
         dev_ctx = tvm.gpu(0)
     else:
         dev_ctx = tvm.cl(0)
     kernel = tvm.build(s, [A, B], device, name="reshape")

     source = np.random.normal(size=src_shape).astype(A.dtype)
     expected = source.reshape(dst_shape)

     source_dev = tvm.nd.array(source, dev_ctx)
     result_dev = tvm.nd.empty(dst_shape, ctx=dev_ctx, dtype=B.dtype)

     kernel(source_dev, result_dev)
     np.testing.assert_allclose(result_dev.asnumpy(), expected)
예제 #11
0
 def check_device(device):
     """Build the transpose kernel for ``device`` and check it against
     NumPy's transpose of a sequentially numbered input.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, [A, B], device, name="tranpose")

     # 0, 1, 2, ... laid out in the input shape, so every element is unique.
     element_count = np.prod(in_shape)
     source = np.arange(element_count).reshape(in_shape).astype(A.dtype)
     expected = np.transpose(source, axes)

     source_dev = tvm.nd.array(source, dev_ctx)
     result_dev = tvm.nd.empty(expected.shape, ctx=dev_ctx, dtype=B.dtype)

     kernel(source_dev, result_dev)
     np.testing.assert_allclose(result_dev.asnumpy(), expected)
예제 #12
0
    def __init__(self):
        """Load the darknet library, the network config and the TVM module.

        All files are read from ``<DATA_RUNTIME_FOLDER>/model/yolo``, where
        ``DATA_RUNTIME_FOLDER`` is taken from the environment and defaults
        to ``/data/runtime``. The TVM graph runtime is created on OpenCL
        device 0 and stored in ``self.mod``.
        """
        ctx = tvm.cl(0)
        ffi = FFI()

        DATA_RUNTIME_FOLDER = os.getenv('DATA_RUNTIME_FOLDER', '/data/runtime')

        self.darknet_lib = __darknetffi__.dlopen(DATA_RUNTIME_FOLDER + '/model/yolo/libdarknet.so')
        self.net = self.darknet_lib.load_network(DATA_RUNTIME_FOLDER + "/model/yolo/yolo.cfg", ffi.NULL, 0)

        lib = tvm.module.load(DATA_RUNTIME_FOLDER + '/model/yolo/yolo.tar')
        # Context managers close the graph/params files after reading.
        with open(DATA_RUNTIME_FOLDER + "/model/yolo/yolo") as graph_file:
            graph = graph_file.read()
        with open(DATA_RUNTIME_FOLDER + "/model/yolo/yolo.params", "rb") as params_file:
            params = bytearray(params_file.read())
        self.mod = graph_runtime.create(graph, lib, ctx)
        self.mod.load_params(params)
예제 #13
0
 def check_device(device):
     """Build the ``concatenate`` kernel for ``device`` and check it
     against ``np.concatenate`` on random inputs.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate")

     # One random input per shape, all using the first tensor's dtype.
     input_dtype = tensor_l[0].dtype
     sources = []
     for shape in shapes:
         sources.append(np.random.normal(size=shape).astype(input_dtype))
     expected = np.concatenate(sources, axis=axis)

     # Kernel arguments: every input first, the output buffer last.
     kernel_args = [tvm.nd.array(source, dev_ctx) for source in sources]
     result_dev = tvm.nd.empty(
         expected.shape, ctx=dev_ctx, dtype=out_tensor.dtype)
     kernel_args.append(result_dev)

     kernel(*kernel_args)
     np.testing.assert_allclose(result_dev.asnumpy(), expected)
예제 #14
0
    def __init__(self):
        """Load the darknet library and the compiled yolov2 TVM module.

        All files are read from ``../../model/yolo`` relative to the
        current working directory. The TVM graph runtime is created on
        OpenCL device 0 and stored in ``self.mod``; the three tracking
        attributes start out empty/zero.
        """
        ctx = tvm.cl(0)

        self.darknet_lib = __darknetffi__.dlopen(
            '../../model/yolo/libdarknet.so')

        lib = tvm.module.load('../../model/yolo/yolov2.tar')
        # Context managers close the graph/params files after reading.
        with open("../../model/yolo/yolov2") as graph_file:
            graph = graph_file.read()
        with open("../../model/yolo/yolov2.params", "rb") as params_file:
            params = bytearray(params_file.read())
        self.mod = graph_runtime.create(graph, lib, ctx)
        self.mod.load_params(params)
        print("mod load params successfully")

        self.parked_car_boxes = None
        self.free_space_frames = 0
        self.frame_index = 0
예제 #15
0
 def check_device(device):
     """Build the ``split`` kernel for ``device`` and check every output
     against the matching piece from ``np.split``.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)
     kernel = tvm.build(s, [A] + tensor_l, device, name="split")

     source = np.random.normal(size=src_shape).astype(A.dtype)
     expected_parts = np.split(source, indices_or_sections, axis=axis)

     # Kernel arguments: the input first, then one empty buffer per piece.
     kernel_args = [tvm.nd.array(source, dev_ctx)]
     part_dtype = tensor_l[0].dtype
     result_parts = []
     for expected in expected_parts:
         result_parts.append(
             tvm.nd.empty(expected.shape, ctx=dev_ctx, dtype=part_dtype))
     kernel_args.extend(result_parts)

     kernel(*kernel_args)
     for index, expected in enumerate(expected_parts):
         np.testing.assert_allclose(result_parts[index].asnumpy(), expected)
예제 #16
0
 def check_device(device):
     """Build the kernel for ``device``, run it twice on random square
     inputs and compare the result with ``np.dot(b.T, a)``.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     kernel = tvm.build(s, [A, B, C], device)
     if device == "cuda":
         dev_ctx = tvm.gpu(0)
     else:
         dev_ctx = tvm.cl(0)

     # All three dimensions share the same size.
     rows = cols = inner = nn
     lhs_host = np.random.uniform(size=(rows, inner)).astype(A.dtype)
     rhs_host = np.random.uniform(size=(cols, inner)).astype(B.dtype)

     lhs_dev = tvm.nd.array(lhs_host, dev_ctx)
     rhs_dev = tvm.nd.array(rhs_host, dev_ctx)
     out_dev = tvm.nd.array(np.zeros((rows, cols), dtype=C.dtype), dev_ctx)

     # Two launches, as in the original run.
     kernel(lhs_dev, rhs_dev, out_dev)
     kernel(lhs_dev, rhs_dev, out_dev)

     expected = np.dot(rhs_host.T, lhs_host)
     np.testing.assert_allclose(out_dev.asnumpy(), expected, rtol=1e-5)
예제 #17
0
 def check_device(device):
     """Build and run the kernels for schedules ``s1`` and ``s2`` on
     ``device`` and compare them with ``b_np`` and ``c_np``.

     Prints a notice and returns early when ``device`` is not enabled.
     """
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return

     dev_ctx = tvm.cl(0) if device != "cuda" else tvm.gpu(0)

     def zeroed_output(tensor):
         # Zero-filled device buffer with the tensor's static shape and dtype.
         host = np.zeros(get_const_tuple(tensor.shape), dtype=tensor.dtype)
         return tvm.nd.array(host, dev_ctx)

     data_dev = tvm.nd.array(a_np, dev_ctx)
     weight_dev = tvm.nd.array(w_np, dev_ctx)
     first_out = zeroed_output(B)
     second_out = zeroed_output(C)

     build_options = tvm.build_config(auto_unroll_max_step=32,
                                      auto_unroll_min_depth=0,
                                      unroll_explicit=False)
     with build_options:
         # First schedule: A, W -> B.
         first_kernel = tvm.build(s1, [A, W, B], device)
         first_kernel(data_dev, weight_dev, first_out)
         np.testing.assert_allclose(first_out.asnumpy(), b_np, rtol=1e-5)
         # Second schedule: A, W -> C.
         second_kernel = tvm.build(s2, [A, W, C], device)
         second_kernel(data_dev, weight_dev, second_out)
         np.testing.assert_allclose(second_out.asnumpy(), c_np, rtol=1e-5)
예제 #18
0
 def check_device(target):
     """Build the ``flstm`` kernel for ``target``, run it once, then print
     the mean time reported by its time evaluator (1000 repeats).
     """
     steps = n_num_step
     kernel = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
     if target == "cuda":
         device = tvm.cuda(0)
     else:
         device = tvm.cl(0)

     # Host-side buffers: zeroed scan outputs, normally distributed inputs.
     state_shape = (steps, batch_size, num_hidden)
     h_host = np.zeros(state_shape).astype("float32")
     c_host = np.zeros(state_shape).astype("float32")
     x_host = np.random.normal(
         size=(steps, batch_size, 4, num_hidden)).astype("float32")
     w_host = np.random.normal(
         size=(4, num_hidden, num_hidden)).astype("float32")

     h_dev = tvm.nd.array(h_host, device)
     c_dev = tvm.nd.array(c_host, device)
     x_dev = tvm.nd.array(x_host, device)
     w_dev = tvm.nd.array(w_host, device)
     kernel_args = (x_dev, w_dev, h_dev, c_dev)

     # Untimed first launch.
     kernel(*kernel_args)
     device.sync()

     # Timing is delegated to the module's evaluator.
     timer = kernel.time_evaluator(kernel.entry_name, device, 1, repeat=1000)
     measurement = timer(*kernel_args)
     print("Time cost=%g" % measurement.mean)
예제 #19
0
파일: matexp.py 프로젝트: zyzhou1028/tvm
    def check_device(target):
        """Build the scan kernel for ``target``, time its second launch and,
        unless ``SKIP_CHECK`` is set, verify the result against NumPy.

        Elements of the first batch entry that differ from the reference by
        more than 1e-5 are printed before the final tolerance check.
        """
        pass_config = {
            "tir.UnrollLoop": {
                "auto_max_step": 128,
            },
            "tir.detect_global_barrier": detect_global_barrier,
        }
        with tvm.transform.PassContext(config=pass_config):
            kernel = tvm.build(s, [s_scan, Whh], target)
        device = tvm.cl(0) if target != "cuda" else tvm.gpu(0)

        # Host buffers: zeroed result; weights are 2/hidden in the left
        # half of the columns and zero in the right half.
        result_host = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        weights = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        weights[:] = 2.0 / n_num_hidden
        weights[:, n_num_hidden // 2:] = 0

        result_dev = tvm.nd.array(result_host, device)
        weights_dev = tvm.nd.array(weights, device)

        # Skip first pass as it is compilation
        kernel(result_dev, weights_dev)
        device.sync()

        # Timed second launch.
        started = time.time()
        kernel(result_dev, weights_dev)
        device.sync()
        elapsed = time.time() - started
        print("Time cost=%g" % elapsed)

        if SKIP_CHECK:
            return

        # Reference: start from ones and multiply by the weights per step.
        measured = result_dev.asnumpy()
        expected = np.ones_like(result_host).astype("float64")
        weights64 = weights.astype("float64")
        for step in range(1, n_num_step):
            expected[step] = np.dot(expected[step - 1], weights64)

        # Report mismatches of the first batch entry in row-major order.
        deviation = np.abs(expected[:, 0, :] - measured[:, 0, :])
        for row, col in zip(*np.nonzero(deviation > 1e-5)):
            print("%d, %d: %g vs %g" %
                  (row, col, expected[row, 0, col], measured[row, 0, col]))
        tvm.testing.assert_allclose(measured, expected, rtol=1e-3)
예제 #20
0
파일: matexp.py 프로젝트: bddppq/tvm
    def check_device(target):
        """Build the scan kernel for ``target``, time its second launch and,
        unless ``SKIP_CHECK`` is set, verify the result against NumPy.

        Elements of the first batch entry that differ from the reference by
        more than 1e-5 are printed before the final tolerance check.
        """
        build_options = tvm.build_config(
            detect_global_barrier=detect_global_barrier,
            auto_unroll_max_step=128,
            unroll_explicit=False)
        with build_options:
            kernel = tvm.build(s, [s_scan, Whh], target)
        if target == "cuda":
            dev_ctx = tvm.gpu(0)
        else:
            dev_ctx = tvm.cl(0)

        # Host buffers: zeroed result; weights are 2/hidden in the left
        # half of the columns and zero in the right half.
        result_shape = (n_num_step, n_batch_size, n_num_hidden)
        result_host = np.zeros(result_shape).astype("float32")
        weights = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        weights[:] = 2.0 / n_num_hidden
        half = n_num_hidden // 2
        weights[:, half:] = 0

        result_dev = tvm.nd.array(result_host, dev_ctx)
        weights_dev = tvm.nd.array(weights, dev_ctx)

        def launch():
            # One kernel launch followed by a device sync.
            kernel(result_dev, weights_dev)
            dev_ctx.sync()

        # Skip first pass as it is compilation
        launch()
        t_begin = time.time()
        launch()
        t_elapsed = time.time() - t_begin
        print("Time cost=%g" % t_elapsed)

        if not SKIP_CHECK:
            measured = result_dev.asnumpy()
            expected = np.ones_like(result_host).astype("float64")
            weights64 = weights.astype("float64")
            # Reference: each step is the previous one times the weights.
            for step in range(1, n_num_step):
                expected[step][:] = expected[step - 1].dot(weights64)
            # Report mismatches of the first batch entry.
            for row in range(n_num_step):
                for col in range(n_num_hidden):
                    want = expected[row, 0, col]
                    got = measured[row, 0, col]
                    if abs(want - got) > 1e-5:
                        print("%d, %d: %g vs %g" % (row, col, want, got))
            tvm.testing.assert_allclose(measured, expected, rtol=1e-3)
예제 #21
0
파일: lstm.py 프로젝트: bddppq/tvm
 def check_device(target):
     """Build the LSTM kernel for ``target``, run it once, then print the
     mean time reported by its time evaluator (1000 repeats).
     """
     n_steps = n_num_step
     lstm_fn = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c],
                         target)
     dev_ctx = tvm.cl(0) if target != "cuda" else tvm.gpu(0)

     def zeros_f32(shape):
         # float32 array of zeros with the given shape.
         return np.zeros(shape).astype("float32")

     def normal_f32(shape):
         # float32 array of standard-normal samples with the given shape.
         return np.random.normal(size=shape).astype("float32")

     hidden_host = zeros_f32((n_steps, batch_size, num_hidden))
     cell_host = zeros_f32((n_steps, batch_size, num_hidden))
     input_host = normal_f32((n_steps, batch_size, 4, num_hidden))
     weight_host = normal_f32((4, num_hidden, num_hidden))

     hidden_dev = tvm.nd.array(hidden_host, dev_ctx)
     cell_dev = tvm.nd.array(cell_host, dev_ctx)
     input_dev = tvm.nd.array(input_host, dev_ctx)
     weight_dev = tvm.nd.array(weight_host, dev_ctx)

     # Untimed first launch.
     lstm_fn(input_dev, weight_dev, hidden_dev, cell_dev)
     dev_ctx.sync()

     # Timing is delegated to the module's evaluator.
     timer = lstm_fn.time_evaluator(lstm_fn.entry_name, dev_ctx, 1,
                                    repeat=1000)
     measurement = timer(input_dev, weight_dev, hidden_dev, cell_dev)
     print("Time cost=%g" % measurement.mean)
예제 #22
0
def opencl_add():
    """Build a vector-add kernel for OpenCL, print its generated source and
    verify it on 1024 random elements.
    """
    # Symbolic length shared by both inputs.
    length_var = tvm.var("n")
    A = tvm.placeholder((length_var, ), name='A')
    B = tvm.placeholder((length_var, ), name='B')
    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
    print(type(C))

    # Split the single axis by 64 and bind the halves to block/thread ids.
    sched = tvm.create_schedule(C.op)
    block_axis, thread_axis = sched[C].split(C.op.axis[0], factor=64)
    sched[C].bind(block_axis, tvm.thread_axis("blockIdx.x"))
    sched[C].bind(thread_axis, tvm.thread_axis("threadIdx.x"))

    kernel = tvm.build(sched, [A, B, C], "opencl", name="myadd")
    print("------opencl code------")
    device_module = kernel.imported_modules[0]
    print(device_module.get_source())

    dev_ctx = tvm.cl(0)
    length = 1024
    lhs = tvm.nd.array(
        np.random.uniform(size=length).astype(A.dtype), dev_ctx)
    rhs = tvm.nd.array(
        np.random.uniform(size=length).astype(B.dtype), dev_ctx)
    total = tvm.nd.array(np.zeros(length, dtype=C.dtype), dev_ctx)

    kernel(lhs, rhs, total)
    expected = lhs.asnumpy() + rhs.asnumpy()
    np.testing.assert_allclose(total.asnumpy(), expected)
예제 #23
0
#

######################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation features into multiple backends,
# we can also generate OpenCL code or LLVM code that runs on CPU backends.
#
# The following code block generates OpenCL code, creates arrays on an OpenCL
# device, and verifies the correctness of the code.
#
# Only runs when the selected target string starts with "opencl".
if tgt.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    # Print the source of the first module imported by the built function.
    print(fadd_cl.imported_modules[0].get_source())
    ctx = tvm.cl(0)
    n = 1024
    # Random inputs and a zeroed output, all created on the OpenCL context.
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd_cl(a, b, c)
    # The kernel output must equal the element-wise sum of the two inputs.
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

######################################################################
# Summary
# -------
# This tutorial provides a walk through of TVM workflow using
# a vector add example. The general workflow is
#
# - Describe your computation via a series of operations.
# - Describe how we want to compute it using schedule primitives.
예제 #24
0
#   any GPUs, provided that you have compiled the code for that GPU.

################################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation features into multiple backends. We can also
# generate OpenCL code or LLVM code that runs on CPU backends.
#
# The following code block generates OpenCL code, creates arrays on an OpenCL
# device, and verifies the correctness of the code.

# Only runs when the name of the target's kind starts with "opencl".
if tgt.kind.name.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    # Print the source of the first module imported by the built function.
    print(fadd_cl.imported_modules[0].get_source())
    dev = tvm.cl(0)
    n = 1024
    # Random inputs and a zeroed output, all created on the OpenCL device.
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd_cl(a, b, c)
    # The kernel output must equal the element-wise sum of the two inputs.
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# .. note:: TE Scheduling Primitives
#
#   TVM includes a number of different scheduling primitives:
#
#   - split: splits a specified axis into two axes by the defined factor.
#   - tile: splits a computation across two axes by the defined factors.
#   - fuse: fuses two consecutive axes of one computation.
# Put the nnvm and topi Python packages from tvm_local (under thisdir) at the
# front of sys.path so they are searched before any other location.
sys.path.insert(0, os.path.join(thisdir, 'tvm_local/nnvm/python'))
sys.path.insert(0, os.path.join(thisdir, 'tvm_local/topi/python'))

import mxnet as mx
import nnvm
import tvm
import numpy as np
import time

# Show which installation of each package was actually imported.
print(mx.__file__)
print(nnvm.__file__)
print(tvm.__file__)

# Target name and a lookup from target name to TVM device handle.
# All three handles are created eagerly when this dict is built.
target = 'opencl'
target_to_device = {
    'opencl': tvm.cl(0),
    'llvm': tvm.cpu(0),
    'cuda': tvm.gpu(0),
}

######################################################################
# Download Resnet18 model from Gluon Model Zoo
# ---------------------------------------------
# In this section, we download a pretrained imagenet model and classify an image.
# NOTE(review): the heading mentions Resnet18 and the Gluon Model Zoo, but the
# Gluon import is commented out and the code below references MobileNetV2
# names -- confirm which model is intended.
# from mxnet.gluon.model_zoo.vision import get_model
from symbols.mobilenetv2 import get_symbol
from PIL import Image
from matplotlib import pyplot as plt

# Model file prefix and test image path, relative to the working directory.
model_name = 'models/mobilenetv2-1_0'
img_name = 'data/cat.jpg'
def run_case(dtype, image):
    """Classify one image, or a whole ImageNet validation directory, with an
    MXNet Gluon model compiled through NNVM/TVM, and record timings.

    Args:
        dtype: dtype name used for the parameters, the compiled graph and
            the input data (``'float32'`` selects ``opt_level`` 2, anything
            else ``opt_level`` 1).
        image: Path of a single image to classify. When ``None`` or ``''``,
            every file whose lower-cased name starts with
            ``ilsvrc2012_val_`` in the directory named by
            ``CK_ENV_DATASET_IMAGENET_VAL`` is classified instead and
            accuracy is tracked against the labels in the file named by
            ``CK_CAFFE_IMAGENET_VAL_TXT``.

    Environment variables read:
        ``STAT_REPEAT`` (timed repetitions per image, default 10; forced to
        1 in dataset mode), ``CK_ENV_DATASET_IMAGENET_VAL``,
        ``CK_CAFFE_IMAGENET_VAL_TXT``, ``CK_ENV_MODEL_MXNET``,
        ``MXNET_MODEL_ID``.

    Side effects:
        Prints predictions and statistics, exports the compiled library to
        a temporary directory, and after every image rewrites
        ``tmp-ck-timer.json`` and ``aggregate-ck-timer.json`` in the
        current directory. Exits the process with status 1 when the dataset
        directory is unset, missing or contains no matching image.

    Relies on names defined elsewhere in the file: ``Image``,
    ``transform_image``, ``get_top5``, ``data_shape``, ``nnvm``, ``tvm``,
    ``np``, ``util`` and ``runtime``.
    """
    import json
    import os
    import random
    import sys
    import time

    # Timed repetitions per image; an unset or empty variable means 10.
    stat_repeat = int(os.environ.get('STAT_REPEAT', '') or 10)

    # FGG: set model files via CK env
    categ_file = '../synset.txt'
    # NOTE(review): eval() executes whatever the synset file contains. If the
    # file is a plain literal, ast.literal_eval would be the safe choice --
    # confirm the file format before switching.
    with open(categ_file) as synset_file:
        synset = eval(synset_file.read())

    files = []
    val = {}

    if image is not None and image != '':
        files = [image]
    else:
        ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '')
        if ipath == '':
            print('Error: path to ImageNet dataset is not set!')
            sys.exit(1)
        if not os.path.isdir(ipath):
            print('Error: path to ImageNet dataset was not found!')
            sys.exit(1)

        # get all files
        files = sorted(
            os.path.join(ipath, name)
            for name in os.listdir(ipath)
            if name.lower().startswith('ilsvrc2012_val_'))
        if not files:
            # Fail with a message instead of an IndexError further down.
            print('Error: no ImageNet validation images were found!')
            sys.exit(1)

        # Dataset mode times every image exactly once.
        stat_repeat = 1

        # Get correct labels: one "<file name> <label index>" pair per line.
        ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '')
        with open(ival) as labels_file:
            label_lines = labels_file.read().split('\n')
        for line in label_lines:
            line = line.strip()
            if line != '':
                fields = line.split(' ')
                val[fields[0]] = int(fields[1])

    # FGG: set timers
    timers = {}

    # Get first shape (expect that will be the same for all)
    dt = time.time()
    image = Image.open(files[0]).resize((224, 224))
    if image.mode != 'RGB':
        image = image.convert('RGB')
    timers['execution_time_load_image'] = time.time() - dt

    dt = time.time()
    img = transform_image(image)
    timers['execution_time_transform_image'] = time.time() - dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model

    model_path = os.environ['CK_ENV_MODEL_MXNET']
    model_id = os.environ['MXNET_MODEL_ID']
    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype != 'float32':
        params = {k: tvm.nd.array(v.asnumpy().astype(dtype))
                  for k, v in params.items()}

    # compile
    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net, tvm.target.mali(), shape={"data": data_shape}, params=params,
            dtype=dtype, target_host=None)

    # Export the compiled library to a temporary directory.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    # The module itself runs on the local OpenCL device.
    ctx = tvm.cl(0)
    rlib = lib
    rparams = params

    # create graph runtime
    dt = time.time()
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle files and pre-read JSON with accuracy to continue aggregating it
    # otherwise if FPGA board hangs, we can continue checking random images ...
    random.shuffle(files)

    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        with open('aggregate-ck-timer.json') as aggregate_file:
            previous = json.load(aggregate_file)

        if 'total_images' in previous:
            total_images = previous['total_images']
        if 'correct_images_top1' in previous:
            correct_images_top1 = previous['correct_images_top1']
        if 'correct_images_top5' in previous:
            correct_images_top5 = previous['correct_images_top5']

    dt1 = time.time()
    for f in files:
        total_images += 1

        print('===============================================================================')
        print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' + f)

        image = Image.open(f).resize((224, 224))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**rparams)

        # perform some warm up runs
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print('')
        print("run (" + str(stat_repeat) + " statistical repetitions)")
        dt = time.time()
        timer = module.module.time_evaluator("run", ctx, number=stat_repeat)
        tcost = timer()
        timers['execution_time_classify'] = (time.time() - dt) / stat_repeat

        # get outputs
        tvm_output = module.get_output(
            0, tvm.nd.empty((1000,), dtype, ctx))
        scores = tvm_output.asnumpy()

        top1 = np.argmax(scores)

        top5 = []
        atop5 = get_top5(scores)

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        for entry in atop5:
            # The class index is the second item of each entry.
            class_index = entry[1]
            top5.append(class_index)
            print(class_index, synset[class_index])

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val) > 0:
            top = val[os.path.basename(f)]

            correct_top1 = (top == top1)
            if correct_top1:
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1:   ' + ('%.5f' % accuracy_top1))

            correct_top5 = top in top5
            if correct_top5:
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5:   ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) + ' sec.')

            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        # Both files are rewritten after every image, so an interrupted run
        # still leaves the latest aggregate on disk.
        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()
예제 #27
0
# Reuse the cached, preprocessed image when it exists; otherwise build it
# from the raw test image and cache it for the next run.
if os.path.isfile(test_image_npy):
    print("File {} exists, skip image preprocessing.".format(test_image_npy))
    img_data = np.load(test_image_npy)
else:
    import cv2
    test_image_path = test_image
    image = cv2.imread(test_image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise IOError("Cannot read image: {}".format(test_image_path))
    img_data = cv2.resize(image, (dshape[2], dshape[3]))
    # Reverse the channel order and convert to float32.
    img_data = img_data[:, :, (2, 1, 0)].astype(np.float32)
    # Subtract the per-channel offsets.
    img_data -= np.array([123, 117, 104])
    # Channels-last -> channels-first, then add the batch axis.
    img_data = np.transpose(np.array(img_data), (2, 0, 1))
    img_data = np.expand_dims(img_data, axis=0)
    np.save(test_image_npy, img_data.astype(dtype))

ctx = tvm.cl()
target = "opencl"

#base = "deploy_ssd_resnet50_512/{}/".format(target)
#base = "deploy_ssd_inceptionv3_512/{}/".format(target)
#base = "deploy_ssd_mobilenet_512/{}/".format(target)
#base = "deploy_ssd_mobilenet_608/{}/".format(target)
#base = "cpu-model/"
base = "./"
path_lib = base + "model.so"
path_graph = base + "model.json"
path_param = base + "model.params"

# Context managers close the graph/params files after reading.
with open(path_graph) as graph_file:
    graph = graph_file.read()
with open(path_param, "rb") as param_file:
    params = bytearray(param_file.read())
lib = tvm.module.load(path_lib)
예제 #28
0
#

######################################################################
# Generate OpenCL Code
# --------------------
# TVM provides code generation for multiple backends;
# we can also generate OpenCL code, or LLVM code that runs on CPU backends.
#
# The following code block generates OpenCL code, creates arrays on the OpenCL
# device, and verifies the correctness of the code.
#
if tgt == "opencl":
    # Compile the schedule for the OpenCL backend under the name "myadd".
    fadd_cl = tvm.build(s, [A, B, C], "opencl", name="myadd")
    print("------opencl code------")
    # Show the source held by the first module imported by the built function.
    opencl_module = fadd_cl.imported_modules[0]
    print(opencl_module.get_source())

    ctx = tvm.cl(0)
    n = 1024
    # Two random inputs and a zero-filled output, all placed on the OpenCL context.
    a_host = np.random.uniform(size=n).astype(A.dtype)
    a = tvm.nd.array(a_host, ctx)
    b_host = np.random.uniform(size=n).astype(B.dtype)
    b = tvm.nd.array(b_host, ctx)
    c_host = np.zeros(n, dtype=C.dtype)
    c = tvm.nd.array(c_host, ctx)

    # Run the kernel, then compare the device result with the NumPy sum.
    fadd_cl(a, b, c)
    device_result = c.asnumpy()
    reference_sum = a.asnumpy() + b.asnumpy()
    tvm.testing.assert_allclose(device_result, reference_sum)

######################################################################
# Summary
# -------
# This tutorial provides a walk through of TVM workflow using
# a vector add example. The general workflow is
#
# - Describe your computation via series of operations.
# - Describe how we want to compute using schedule primitives.
예제 #29
0
    def check_device(device):
        """Build, time and validate the three depthwise-conv kernels on ``device``.

        If ``device`` is not enabled in this TVM build, a notice is printed
        and nothing else happens. Otherwise the three schedules (plain
        depthwise conv2d, + scale/shift, + relu) are built, each is timed
        over 1000 runs, the shapes and timings are printed, and the outputs
        are compared against a NumPy reference with ``rtol=1e-5``.
        """
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return

        if device == "cuda":
            ctx = tvm.gpu(0)
        else:
            ctx = tvm.cl(0)

        def _zero_output(tensor):
            # Zero-filled device buffer with the tensor's constant shape and dtype.
            host_zeros = np.zeros(shape=get_const_tuple(tensor.shape),
                                  dtype=tensor.dtype)
            return tvm.nd.array(host_zeros, ctx)

        def _mean_cost(kernel, *buffers):
            # Mean cost reported by the time evaluator over 1000 runs.
            evaluator = kernel.time_evaluator(kernel.entry_name, ctx, number=1000)
            return evaluator(*buffers).mean

        # Compile the three kernels.
        conv_kernel = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        scale_shift_kernel = tvm.build(
            s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        relu_kernel = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Upload the inputs and allocate the outputs on the device.
        input_dev = tvm.nd.array(input_np, ctx)
        filter_dev = tvm.nd.array(filter_np, ctx)
        scale_dev = tvm.nd.array(scale_np, ctx)
        shift_dev = tvm.nd.array(shift_np, ctx)

        conv_out = _zero_output(DepthwiseConv2d)
        scale_shift_out = _zero_output(ScaleShift)
        relu_out = _zero_output(Relu)

        # Time each kernel; the timed runs also fill the output buffers
        # that are checked below.
        conv_cost = _mean_cost(conv_kernel, input_dev, filter_dev, conv_out)
        scale_shift_cost = _mean_cost(scale_shift_kernel, input_dev, filter_dev,
                                      scale_dev, shift_dev, scale_shift_out)
        relu_cost = _mean_cost(relu_kernel, input_dev, filter_dev,
                               scale_dev, shift_dev, relu_out)

        print("Input shape = " + str(get_const_tuple(Input.shape)))
        print("Filter shape = " + str(get_const_tuple(Filter.shape)))
        print("Stride = (%d, %d)" % (stride_h, stride_w))
        print("padding = %s\n" % padding)
        print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
        print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
              (conv_cost * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us"
            % (scale_shift_cost * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us"
            % (relu_cost * 1e6))

        # Reference results computed on the host.
        expected_conv = topi.testing.depthwise_conv2d_python_nchw(
            input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        expected_scale_shift = np.zeros(shape=get_const_tuple(ScaleShift.shape))
        for ch in range(in_channel * channel_multiplier):
            # Per-channel affine transform of the conv reference.
            expected_scale_shift[:, ch, :, :] = (
                expected_conv[:, ch, :, :] * scale_np[ch] + shift_np[ch])
        expected_relu = np.maximum(expected_scale_shift, 0)

        # Compare each device output with its reference, in kernel order.
        checks = (
            (conv_out, expected_conv),
            (scale_shift_out, expected_scale_shift),
            (relu_out, expected_relu),
        )
        for device_result, reference in checks:
            np.testing.assert_allclose(device_result.asnumpy(),
                                       reference,
                                       rtol=1e-5)
        print("success")
def run_case(model, dtype):
    """Compile ``model`` for the Mali target, run it, and print the mean cost.

    ``model`` must be one of ``'vgg16'``, ``'resnet18'`` or ``'mobilenet'``;
    any other value raises ``ValueError``. ``dtype`` is forwarded to the
    workload builder and the compiler, and also selects the optimization
    level. The module runs on the remote OpenCL device when ``args.host``
    is set, otherwise on the local one. One tab-separated result line is
    printed.
    """
    # Workload factories are wrapped in lambdas so only the selected one runs.
    workload_factories = {
        'vgg16': lambda: nnvm.testing.vgg.get_workload(
            num_layers=16, batch_size=1, image_shape=image_shape, dtype=dtype),
        'resnet18': lambda: nnvm.testing.resnet.get_workload(
            num_layers=18, batch_size=1, image_shape=image_shape, dtype=dtype),
        'mobilenet': lambda: nnvm.testing.mobilenet.get_workload(
            batch_size=1, image_shape=image_shape, dtype=dtype),
    }
    if model not in workload_factories:
        raise ValueError('no benchmark prepared for {}.'.format(model))
    net, params = workload_factories[model]()

    # float32 is compiled at opt level 2, every other dtype at level 1.
    if dtype == 'float32':
        opt_level = 2
    else:
        opt_level = 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(
            net,
            tvm.target.mali(),
            shape={"data": data_shape},
            params=params,
            dtype=dtype,
            target_host=args.target_host)

    # Export the compiled library to a temporary tarball.
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if args.host is None:
        # Local run: use the freshly built library and parameters directly.
        ctx = tvm.cl(0)
        rlib = lib
        rparams = params
    else:
        # Remote run: upload the tarball and place the parameters on the
        # remote OpenCL context.
        remote = rpc.connect(args.host, args.port)
        remote.upload(lib_fname)

        ctx = remote.cl(0)
        rlib = remote.load_module('net.tar')
        rparams = {name: tvm.nd.array(value, ctx)
                   for name, value in params.items()}

    # Create the graph runtime and feed it random input plus the parameters.
    module = runtime.create(graph, rlib, ctx)
    random_input = np.random.uniform(size=(data_shape)).astype(dtype)
    module.set_input('data', tvm.nd.array(random_input))
    module.set_input(**rparams)

    # mobilenet is fast, so it gets five times more runs for a stable
    # measurement.
    run_scale = 5 if model == 'mobilenet' else 1
    num_warmup = 10 * run_scale
    num_test = 60 * run_scale

    # Warm-up runs first; their result is discarded.
    module.module.time_evaluator("run", ctx, num_warmup)()

    # Measured runs.
    prof_res = module.module.time_evaluator("run", ctx, num_test)()

    print("backend: TVM-mali\tmodel: %s\tdtype: %s\tcost:%.4f" %
          (model, dtype, prof_res.mean))
예제 #31
0
                     remote=None):
    for item in workloads:
        cost, gflops = verify_conv2d_nchw(*item,
                                          ctx=ctx,
                                          target=target,
                                          target_host=target_host,
                                          remote=remote)
        print("%-30s %.6f %.6f" % (item, cost, gflops))


#def tune_workloads(ctx, n_times=1, target=None, target_host=None, remote=None):
#    ret = []
#    for item in workloads:
#        cost, gflops, config = tune_conv2d_nchw(*item, ctx=ctx, target_host=target_host, remote=remote)
#        print(item, cost, gflops, config)
#        ret.append([item, config])
#    for item in ret:
#        print(item, config)

if __name__ == "__main__":
    # Only the commented-out RPC path below uses the device host, so a missing
    # TVM_OPENCL_DEVICE_HOST must not abort the local OpenCL run with KeyError.
    host = os.environ.get("TVM_OPENCL_DEVICE_HOST")
    port = 9090
    #remote = rpc.connect(host, port)
    #target_host = "llvm -target=aarch64-linux-gnu -mattr=+neon"
    target_host = None

    # Run the workloads on the local OpenCL device, with no remote session.
    #verify_workloads(remote.cl(), 1000, tvm.target.mali(), target_host, remote)
    verify_workloads(tvm.cl(), 1000,
                     tvm.target.create("opencl -device=mercytest"),
                     target_host, None)