Example #1
File: testss.py  Project: CyanHillFox/tvm
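All of the examples on this page assume a common preamble that the listing does not show. A minimal sketch of the imports, inferred from usage (the nnpu module and ScheduleProcHelper are project-specific, so their import paths are assumptions):

import numpy as np
import tvm
import nnvm
import nnvm.compiler
import nnvm.symbol as sym                  # later examples refer to nnvm.symbol as `sym`
import nnpu                                # NNPU backend from the CyanHillFox/tvm project (assumed)
from nnpu.utils import ScheduleProcHelper  # assumed import path for ScheduleProcHelper
from tvm.contrib import graph_runtime as runtime  # `runtime.create(...)` matches graph_runtime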
def test_batch_norm():
    # NOTE: despite its name, this test currently builds and runs a single relu op.
    input_shape = (1, 4, 4, 16)
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    inputs2 = nnvm.symbol.Variable("inputs2")  # unused below
    z1 = nnvm.symbol.relu(inputs1)
    # z2 = nnvm.symbol.relu(z1)
    compute_graph = nnvm.graph.create(z1)

    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"inputs1": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"inputs1": input_shape},
                        dtype="float32",
                        target_host=target_host)

        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size  = (1, 4, 4, 16), low = -32, high = 32).astype(np.float32)
        b_np = np.random.uniform(size  = (1, 16), low = -32, high = 32).astype(np.float32)
        print(a_np)
        module.set_input(inputs1 = a_np)
        module.run()
        out = module.get_output(0, out = tvm.nd.empty((1, 4, 4, 16)))
        print(out.asnumpy)
        print(compute_graph.ir())
        print(deploy_graph.ir())
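Every example on this page follows the same four-step graph_runtime pattern once the graph is built; annotated for reference, with the names used in the snippet above:

module = runtime.create(deploy_graph, lib, ctx)        # instantiate the graph executor
module.set_input(inputs1=a_np)                         # bind named input tensors
module.run()                                           # execute the whole graph once
out = module.get_output(0, out=tvm.nd.empty(shape))    # copy output 0 into a host buffer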
Example #2
def test_dense():
    shape = (16, 1024)
    weight_shape = (256, 1024)
    bias_shape = (256, )
    inputs = nnvm.symbol.Variable("inputs")
    weights = nnvm.symbol.Variable("weights")
    bias = nnvm.symbol.Variable("bias")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.dense(data=inputs, weight=weights, use_bias=False, units=256)
    z1 = nnvm.symbol.relu(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={
                    "inputs": shape,
                    "weights": weight_shape
                },
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={
                            "inputs": shape,
                            "weights": weight_shape
                        },
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=shape)
        b_np = np.random.random(size=weight_shape)

        m.set_input(**{"inputs": a_np, "weights": b_np})
        m.run()
        gt = a_np.dot(b_np.transpose())  # dense with use_bias=False computes Y = X @ W.T

        out = m.get_output(0, out=tvm.nd.empty((16, 256)))
        np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-5)
        print("tests")
        print(out)
        print(compute_graph.ir())
        print(deploy_graph.ir())
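As a standalone sanity check of the ground-truth line above: dense with use_bias=False computes Y = X · Wᵀ, so the expected output shape follows directly from the input and weight shapes. Plain NumPy, no TVM needed:

import numpy as np
x = np.random.random((16, 1024)).astype("float32")
w = np.random.random((256, 1024)).astype("float32")
y = x.dot(w.transpose())        # dense, no bias: Y = X · Wᵀ
assert y.shape == (16, 256)     # matches the (16, 256) output buffer above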
Example #3
def test_conv2d():
    input_shape = (1, 16, 10, 64)
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    inputs1 = nnvm.symbol.Variable("inputs1")
    z1 = nnvm.symbol.conv2d(data=inputs,
                            channels=64,
                            kernel_size=(3, 3),
                            padding=(0, 0),
                            use_bias=False,
                            layout='NHWC',
                            kernel_layout='HWOI')
    z2 = nnvm.symbol.sigmoid(z1)
    z = nnvm.symbol.elemwise_add(z2, inputs1)

    compute_graph = nnvm.graph.create(z)

    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":

            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={
                    "inputs": input_shape,
                    "inputs1": (1, 14, 8, 64)
                },
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={
                            "inputs": input_shape,
                            "inputs1": (1, 14, 8, 64)  # was missing here; the graph has two inputs
                        },
                        dtype="float32",
                        target_host=target_host)

        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size=input_shape, low=-32,
                                 high=32).astype(np.float32)
        b_np = np.random.uniform(size=(1, 14, 8, 64), low=-32,
                                 high=32).astype(np.float32)
        module.set_input(inputs=a_np, inputs1=b_np)  # inputs1 was never bound; b_np was unused
        module.run()
        print(deploy_graph.ir())
        out = module.get_output(0, out=tvm.nd.empty((1, 14, 8, 64)))
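The (1, 14, 8, 64) shape used for inputs1 and the output buffer follows from valid-convolution arithmetic (out = in - kernel + 1 per spatial axis, with padding 0 and stride 1); a quick check:

in_h, in_w, k = 16, 10, 3              # NHWC input is (1, 16, 10, 64), kernel is 3x3
out_h, out_w = in_h - k + 1, in_w - k + 1
assert (out_h, out_w) == (14, 8)       # hence (1, 14, 8, 64) for the NHWC output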
Example #4
def test_elemwise_mul():
    env = nnpu.get_env()
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs1 = nnvm.symbol.Variable("inputs1")
    inputs2 = nnvm.symbol.Variable("inputs2")
    shape = (16, 6, 16)
    z = nnvm.symbol.elemwise_mul(inputs1, inputs2)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={
                    "inputs1": shape,
                    "inputs2": shape
                },
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={
                            "inputs1": shape,
                            "inputs2": shape
                        },
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(shape)
        b_np = np.random.random(shape)
        print("a_np : ")
        print(a_np)
        print("b_np : ")
        print(b_np)
        m.set_input(**{"inputs1": a_np, "inputs2": b_np})
        gt = (a_np.astype("float32") *
              b_np.astype("float32")).astype("float32")
        m.run()
        out = m.get_output(0, out=tvm.nd.empty((16, 6, 16)))
        np.testing.assert_allclose(out.asnumpy(), gt)
        print("elemwise_mul tests success")
        print(out)
Example #5
def test_max_pool2d():
    # NOTE: despite its name, this test exercises avg_pool2d.
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    target_host = "llvm"
    inputs = nnvm.symbol.Variable("inputs")
    shape = (1, 224, 224, 16)
    kernels = nnvm.symbol.Variable("kernels")  # unused below
    kernel_shape = (2, 2)  # unused below
    z = nnvm.symbol.avg_pool2d(inputs,
                               pool_size=(2, 2),
                               strides=(1, 1),
                               layout="NHWC")
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"inputs": shape},
                dtype="float32")
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"inputs": shape},
                        dtype="float32")
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=(1, 224, 224, 16))
        m.set_input(**{"inputs": a_np})
        m.run()

        out = m.get_output(0, out=tvm.nd.empty((1, 223, 223, 16)))
        gt = avg_pooling((1, 224, 224, 16), (1, 223, 223, 16), (2, 2), a_np,
                         (1, 1), "float32")
        np.testing.assert_allclose(out.asnumpy(), gt, rtol=5e-7)
        print("max_pool2d tests success")
        print(gt)
        print(out)
        print("end")
Example #6
def test_log():
    env = nnpu.get_env()
    shape = (1, 22, 22, 16)
    device = "nnpu"
    target_host = "llvm"
    target = tvm.target.create("llvm -device={}".format(device))
    inputs = nnvm.symbol.Variable("inputs")
    z = nnvm.symbol.log(inputs)
    z1 = nnvm.symbol.exp(z)
    compute_graph = nnvm.graph.create(z1)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"inputs": shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"inputs": shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(shape)
        print(a_np)
        m.set_input(**{"inputs": a_np})
        m.run()
        out = m.get_output(0, out=tvm.nd.empty(shape))
        gt = np.exp(np.log(
            a_np.astype("float32")).astype("float32")).astype("float32")
        # since exp(log(x)) == x for x > 0, gt equals a_np up to float32 rounding
        print(out)
        np.testing.assert_allclose(out.asnumpy(), gt)
        print("log tests success")
        print(compute_graph.ir())
        print(deploy_graph.ir())
Example #7
def test_relu():
    shape = (2, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))
    z = nnvm.symbol.relu(inputs)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"inputs": shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"inputs": shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=(2, 16)).astype("float32") - 0.5
        m.set_input(**{'inputs': a_np})
        m.run()
        out = m.get_output(0, out=tvm.nd.empty((2, 16)))
        print(a_np)
        print(out.dtype)
        print(out)
        np.testing.assert_allclose(out.asnumpy(), np.maximum(a_np, 0))
        print("tests")
        print(compute_graph.ir())
        print(deploy_graph.ir())
Example #8
def test_onemore():
    shape = (1, 32, 32, 16)
    inputs = nnvm.symbol.Variable("inputs")
    env = nnpu.get_env()
    target_host = "llvm"
    device = "nnpu"
    target = tvm.target.create("llvm -device={}".format(device))

    z1 = nnvm.symbol.relu(inputs)
    z = nnvm.symbol.sqrt(z1)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"inputs": shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"inputs": shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        m = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=(1, 32, 32, 16))
        m.set_input(**{'inputs': a_np})
        m.run()
        out = m.get_output(0, out=tvm.nd.empty((1, 32, 32, 16)))
        print(out)
        print(compute_graph.ir())
        print(deploy_graph.ir())
Example #9
def test_vgg():
    def get_feature(internel_layer, layers, filters, batch_norm=False):
        """
        Get VGG feature body as stacks of convolutions.
        layers  : [1, 1, 2, 2, 2]
        filters : [64, 128, 256, 512, 512]
        """
        for i, num in enumerate(layers):
            """
			i = 0, num = 1
			i = 1, num = 1
			i = 2, num = 2
			i = 3, num = 2
			i = 4, num = 2
			"""
            for j in range(num):
                internel_layer = sym.pad(data=internel_layer,
                                         pad_width=((0, 0), (1, 1), (1, 1),
                                                    (0, 0)))
                internel_layer = sym.conv2d(data=internel_layer,
                                            kernel_size=(3, 3),
                                            channels=filters[i],
                                            layout='NHWC',
                                            kernel_layout='HWOI',
                                            name="conv%s_%s" % (i + 1, j + 1))
                if batch_norm:
                    internel_layer = sym.batch_norm(data=internel_layer,
                                                    axis=3,
                                                    name="bn%s_%s" %
                                                    (i + 1, j + 1))
                internel_layer = sym.relu(data=internel_layer,
                                          name="relu%s_%s" % (i + 1, j + 1))

            internel_layer = sym.max_pool2d(data=internel_layer,
                                            pool_size=(2, 2),
                                            strides=(2, 2),
                                            layout="NHWC",
                                            name="pool%s" % (i + 1))
        return internel_layer  # dedented: returning inside the loop cut VGG down to its first stage

    def get_classifier(input_data, num_classes):
        """
		Get VGG classifier layers as fc layers.
		"""
        flatten = sym.flatten(data=input_data, name="flatten")
        fc1 = sym.dense(data=flatten, units=32, name="fc1")
        relu1 = sym.relu(data=fc1, name="relu1")
        drop1 = sym.dropout(data=relu1, rate=0.5, name="drop1")
        fc2 = sym.dense(data=drop1, units=32, name="fc2")
        relu2 = sym.relu(data=fc2, name="relu2")
        drop2 = sym.dropout(data=relu2, rate=0.5, name="drop2")
        fc3 = sym.dense(data=drop2, units=num_classes, name="fc3")
        return fc3

    def get_symbol(datas, num_classes, num_layers=11, batch_norm=False):
        """
		Parameters
		------------
		num_classes     : int, default 16
						Number of classification classes

		num_layers      : int
						Number of layers for the variant of vgg. Options are 11, 13, 16, 19

		batch_norm      : bool, default False
						Use batch normalization.

		"""
        vgg_spec = {
            11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])
        }

        if num_layers not in vgg_spec:
            raise ValueError(
                "Invalid num_layers {}. Choices are 11, 13, 16, 19.".format(
                    num_layers))
        layers, filters = vgg_spec[num_layers]
        feature = get_feature(datas, layers, filters, batch_norm)
        classifier = get_classifier(feature, num_classes)
        symbol = sym.softmax(data=classifier, name="softmax")
        return symbol

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            nnpu.set_device(nnpu.get_env(), type='SC')
            with ScheduleProcHelper():
                with nnpu.build_config():
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size=input_shape, low=-32,
                                 high=32).astype(np.float32)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        ftimer_result = ftimer()  # executes "run" num_runs times; module.run() is otherwise never called
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer_result.mean * 10)
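Calling the time_evaluator handle returns a TVM ProfileResult whose .mean field is in seconds, so the * 10 scaling above is arbitrary; milliseconds would be the conventional report:

print("mean runtime: %.3f ms" % (ftimer_result.mean * 1000))  # ProfileResult.mean is in seconds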
Example #10
def test_inception_v3():
    def Conv(data,
             num_filter,
             kernel=(1, 1),
             stride=(1, 1),
             pad=(0, 0),
             name=None,
             suffix=''):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]),
                                      (pad[1], pad[1]), (0, 0)))
        conv = sym.conv2d(data=data,
                          channels=num_filter,
                          kernel_size=kernel,
                          strides=stride,
                          padding=(0, 0),
                          use_bias=False,
                          layout='NHWC',
                          kernel_layout='HWOI',
                          name="%s%s_conv2d" % (name, suffix))
        bn = sym.batch_norm(data=conv,
                            name="%s%s_batchnorm" % (name, suffix),
                            epsilon=2e-5,
                            axis=3)
        act = sym.relu(data=bn, name="%s%s_relu" % (name, suffix))
        return act

    def Pooling(data, kernel, stride, pad, pool_type, name):
        if pad[0] != 0 or pad[1] != 0:
            data = sym.pad(data=data,
                           pad_width=((0, 0), (pad[0], pad[0]),
                                      (pad[1], pad[1]), (0, 0)))
        if pool_type == 'max':
            return sym.max_pool2d(data=data,
                                  pool_size=kernel,
                                  strides=stride,
                                  name=name,
                                  layout='NHWC')
        if pool_type == 'avg':
            return sym.avg_pool2d(data=data,
                                  pool_size=kernel,
                                  strides=stride,
                                  name=name,
                                  layout='NHWC')
        raise ValueError("Invalid pooling type: " + pool_type)

    def Inception7A(data, num_1x1, num_3x3_red, num_3x3_1, num_3x3_2,
                    num_5x5_red, num_5x5, pool, proj, name):
        # num_1x1 = 64
        # num_3x3_red = 64
        # num_3x3_1 = 96
        # num_3x3_2 = 96
        # num_5x5_red = 48
        # num_5x5 = 64
        tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))

        tower_5x5 = Conv(data,
                         num_5x5_red,
                         name=('%s_tower' % name),
                         suffix='_conv')
        tower_5x5 = Conv(tower_5x5,
                         num_5x5,
                         kernel=(5, 5),
                         pad=(2, 2),
                         name=('%s_tower' % name),
                         suffix='_conv_1')

        tower_3x3 = Conv(data,
                         num_3x3_red,
                         name=('%s_tower_1' % name),
                         suffix="_conv")
        tower_3x3 = Conv(tower_3x3,
                         num_3x3_1,
                         kernel=(3, 3),
                         pad=(1, 1),
                         name=('%s_tower_1' % name),
                         suffix='_conv_1')
        tower_3x3 = Conv(tower_3x3,
                         num_3x3_2,
                         kernel=(3, 3),
                         pad=(1, 1),
                         name=('%s_tower_1' % name),
                         suffix='_conv_2')
        pooling = Pooling(data,
                          kernel=(3, 3),
                          stride=(1, 1),
                          pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))

        cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
        concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj],
                                 axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7B(data, num_3x3, num_d3x3_red, num_d3x3_1, num_d3x3_2, pool,
                    name):
        tower_3x3 = Conv(data,
                         num_3x3,
                         kernel=(3, 3),
                         pad=(0, 0),
                         stride=(2, 2),
                         name=('%s_conv' % name))

        tower_d3x3 = Conv(data,
                          num_d3x3_red,
                          name=('%s_tower' % name),
                          suffix='_conv')
        tower_d3x3 = Conv(tower_d3x3,
                          num_d3x3_1,
                          kernel=(3, 3),
                          pad=(1, 1),
                          stride=(1, 1),
                          name=('%s_tower' % name),
                          suffix='_conv_1')
        tower_d3x3 = Conv(tower_d3x3,
                          num_d3x3_2,
                          kernel=(3, 3),
                          pad=(0, 0),
                          stride=(2, 2),
                          name=('%s_tower' % name),
                          suffix='_conv_2')

        pooling = Pooling(data=data,
                          kernel=(3, 3),
                          stride=(2, 2),
                          pad=(0, 0),
                          pool_type="max",
                          name=('max_pool_%s_pool' % name))
        concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling],
                                 axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7C(data, num_1x1, num_d7_red, num_d7_1, num_d7_2, num_q7_red,
                    num_q7_1, num_q7_2, num_q7_3, num_q7_4, pool, proj, name):
        tower_1x1 = Conv(data=data,
                         num_filter=num_1x1,
                         kernel=(1, 1),
                         name=('%s_conv' % name))

        tower_d7 = Conv(data=data,
                        num_filter=num_d7_red,
                        name=('%s_tower' % name),
                        suffix='_conv')
        tower_d7 = Conv(data=tower_d7,
                        num_filter=num_d7_1,
                        kernel=(1, 7),
                        pad=(0, 3),
                        name=('%s_tower' % name),
                        suffix='_conv_1')
        tower_d7 = Conv(data=tower_d7,
                        num_filter=num_d7_2,
                        kernel=(7, 1),
                        pad=(3, 0),
                        name=('%s_tower' % name),
                        suffix='_conv_2')

        tower_q7 = Conv(data=data,
                        num_filter=num_q7_red,
                        name=('%s_tower_1' % name),
                        suffix='_conv')
        tower_q7 = Conv(data=tower_q7,
                        num_filter=num_q7_1,
                        kernel=(7, 1),
                        pad=(3, 0),
                        name=('%s_tower_1' % name),
                        suffix='_conv_1')
        tower_q7 = Conv(data=tower_q7,
                        num_filter=num_q7_2,
                        kernel=(1, 7),
                        pad=(0, 3),
                        name=('%s_tower_1' % name),
                        suffix='_conv_2')
        tower_q7 = Conv(data=tower_q7,
                        num_filter=num_q7_3,
                        kernel=(7, 1),
                        pad=(3, 0),
                        name=('%s_tower_1' % name),
                        suffix='_conv_3')
        tower_q7 = Conv(data=tower_q7,
                        num_filter=num_q7_4,
                        kernel=(1, 7),
                        pad=(0, 3),
                        name=('%s_tower_1' % name),
                        suffix='_conv_4')

        pooling = Pooling(data=data,
                          kernel=(3, 3),
                          stride=(1, 1),
                          pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling,
                     num_filter=proj,
                     kernel=(1, 1),
                     name=('%s_tower_2' % name),
                     suffix='_conv')

        concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj],
                                 axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7D(data, num_3x3_red, num_3x3, num_d7_3x3_red, num_d7_1,
                    num_d7_2, num_d7_3x3, pool, name):
        tower_3x3 = Conv(data=data,
                         num_filter=num_3x3_red,
                         name=('%s_tower' % name),
                         suffix='_conv')
        tower_3x3 = Conv(data=tower_3x3,
                         num_filter=num_3x3,
                         kernel=(3, 3),
                         pad=(0, 0),
                         stride=(2, 2),
                         name=('%s_tower' % name),
                         suffix='_conv_1')

        tower_d7_3x3 = Conv(data=data,
                            num_filter=num_d7_3x3_red,
                            name=('%s_tower_1' % name),
                            suffix='_conv')
        tower_d7_3x3 = Conv(data=tower_d7_3x3,
                            num_filter=num_d7_1,
                            kernel=(1, 7),
                            pad=(0, 3),
                            name=('%s_tower_1' % name),
                            suffix='_conv_1')
        tower_d7_3x3 = Conv(data=tower_d7_3x3,
                            num_filter=num_d7_2,
                            kernel=(7, 1),
                            pad=(3, 0),
                            name=('%s_tower_1' % name),
                            suffix='_conv_2')
        tower_d7_3x3 = Conv(data=tower_d7_3x3,
                            num_filter=num_d7_3x3,
                            kernel=(3, 3),
                            stride=(2, 2),
                            name=('%s_tower_1' % name),
                            suffix='_conv_3')
        pooling = Pooling(data=data,
                          kernel=(3, 3),
                          stride=(2, 2),
                          pool_type=pool,
                          pad=(0, 0),
                          name=('%s_pool_%s_pool' % (pool, name)))

        concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling],
                                 axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def Inception7E(data, num_1x1, num_d3_red, num_d3_1, num_d3_2,
                    num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, pool,
                    proj, name):

        tower_1x1 = Conv(data=data,
                         num_filter=num_1x1,
                         kernel=(1, 1),
                         name=('%s_conv' % name))

        tower_d3 = Conv(data=data,
                        num_filter=num_d3_red,
                        name=('%s_tower' % name),
                        suffix='_conv')
        tower_d3_a = Conv(data=tower_d3,
                          num_filter=num_d3_1,
                          kernel=(1, 3),
                          pad=(0, 1),
                          name=('%s_tower' % name),
                          suffix='_mixed_conv')
        tower_d3_b = Conv(data=tower_d3,
                          num_filter=num_d3_2,
                          kernel=(3, 1),
                          pad=(1, 0),
                          name=('%s_tower' % name),
                          suffix='_mixed_conv_1')

        tower_3x3_d3 = Conv(data=data,
                            num_filter=num_3x3_d3_red,
                            name=('%s_tower_1' % name),
                            suffix='_conv')
        tower_3x3_d3 = Conv(data=tower_3x3_d3,
                            num_filter=num_3x3,
                            kernel=(3, 3),
                            pad=(1, 1),
                            name=('%s_tower_1' % name),
                            suffix='_conv_1')
        tower_3x3_d3_a = Conv(data=tower_3x3_d3,
                              num_filter=num_3x3_d3_1,
                              kernel=(1, 3),
                              pad=(0, 1),
                              name=('%s_tower_1' % name),
                              suffix='_mixed_conv')
        tower_3x3_d3_b = Conv(data=tower_3x3_d3,
                              num_filter=num_3x3_d3_2,
                              kernel=(3, 1),
                              pad=(1, 0),
                              name=('%s_tower_1' % name),
                              suffix='_mixed_conv_1')

        pooling = Pooling(data=data,
                          kernel=(3, 3),
                          stride=(1, 1),
                          pad=(1, 1),
                          pool_type=pool,
                          name=('%s_pool_%s_pool' % (pool, name)))
        cproj = Conv(data=pooling,
                     num_filter=proj,
                     kernel=(1, 1),
                     name=('%s_tower_2' % name),
                     suffix='_conv')

        concat = sym.concatenate(*[
            tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b,
            cproj
        ],
                                 axis=3,
                                 name='ch_concat_%s_chconcat' % name)
        return concat

    def get_symbol(data, num_classes=16, **kwargs):
        conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
        conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
        conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
        pool = Pooling(data=conv_2,
                       kernel=(3, 3),
                       stride=(2, 2),
                       pool_type="max",
                       pad=(0, 0),
                       name="pool")
        conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
        conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
        pool1 = Pooling(data=conv_4,
                        kernel=(3, 3),
                        stride=(2, 2),
                        pool_type="max",
                        pad=(0, 0),
                        name="pool1")

        in3a = Inception7A(pool1, 64, 64, 96, 96, 48, 64, "avg", 32, "mixed")
        in3b = Inception7A(in3a, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_1")

        in3c = Inception7A(in3b, 64, 64, 96, 96, 48, 64, "avg", 64, "mixed_2")

        in3d = Inception7B(in3c, 384, 64, 96, 96, "max", "mixed_3")

        in4a = Inception7C(in3d, 192, 128, 128, 192, 128, 128, 128, 128, 192,
                           "avg", 192, "mixed_4")

        in4b = Inception7C(in4a, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_5")
        in4c = Inception7C(in4b, 192, 160, 160, 192, 160, 160, 160, 160, 192,
                           "avg", 192, "mixed_6")
        in4d = Inception7C(in4c, 192, 192, 192, 192, 192, 192, 192, 192, 192,
                           "avg", 192, "mixed_7")

        in4e = Inception7D(in4d, 192, 320, 192, 192, 192, 192, "max",
                           "mixed_8")

        in5a = Inception7E(in4e, 320, 384, 384, 384, 448, 384, 384, 384, "avg",
                           192, "mixed_9")
        in5b = Inception7E(in5a, 320, 384, 384, 384, 448, 384, 384, 384, "max",
                           192, "mixed_10")

        pool = Pooling(data=in5b,
                       kernel=(8, 8),
                       stride=(1, 1),
                       pool_type="avg",
                       pad=(0, 0),
                       name="global_pool")
        flatten = sym.flatten(data=pool, name="flatten")
        fc1 = sym.dense(data=flatten, units=num_classes, name="fc1")
        softmax = sym.softmax(data=fc1, name="softmax")
        return softmax

    input_shape = (1, 299, 299, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(data=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=1):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size=(1, 299, 299, 16), low=-32,
                                 high=32).astype(np.float32)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        module.run()
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer().mean * 10)
Example #11
    # we can move gemm into acc2buffer copy.
    xo, xi = s[out_buf].split(out_buf.op.axis[0], factor=gemm_shape[0])
    s[prod_buf].compute_at(s[out_buf], xo)
    s[out_buf].pragma(xi, env.copy_acc2buf)

    # split and tensorize VAddV.
    nvctr_unit = env.cfg['vector_unit']['size']
    xo, xi = s[res_buf].split(res_buf.op.axis[0], factor=nvctr_unit)
    s[res_buf].tensorize(xi, env.intrins.get('VAddV', mode='w'))
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    # with nnpu.build_config(dump_pass_ir=True):
    with nnpu.build_config():
        print(nnpu.lower(s, [weight, data, bias, res_host], simple_mode=True))

    func = nnpu.build(s, [weight, data, bias, res_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_func')
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=weight_shape,
                             dtype=weight.dtype,
                             low=-32,
Example #12
def test_densenet():
    def Conv(datas, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            datas = nnvm.symbol.pad(data=datas,
                                    pad_width=((0, 0), (pad[0], pad[0]),
                                               (pad[1], pad[1]), (0, 0)))
        conv = nnvm.symbol.conv2d(data=datas,
                                  kernel_size=kernel_size,
                                  channels=filter_nums,
                                  strides=stride,
                                  layout='NHWC',
                                  kernel_layout='HWOI')
        return conv

    def bottleneck_layer(datas, filters):
        bn1 = nnvm.symbol.batch_norm(data=datas, epsilon=2e-5, axis=3)
        relu1 = nnvm.symbol.relu(data=bn1)
        conv1 = Conv(datas=relu1, kernel_size=(1, 1), filter_nums=4 * filters)
        bn2 = nnvm.symbol.batch_norm(data=conv1, epsilon=2e-5, axis=3)
        relu2 = nnvm.symbol.relu(data=bn2)
        conv2 = Conv(datas=relu2,
                     kernel_size=(3, 3),
                     filter_nums=filters,
                     pad=(1, 1))
        return conv2

    def transition_layer(datas, filters):
        conv = Conv(datas=datas, kernel_size=(1, 1), filter_nums=filters)

        pool = nnvm.symbol.avg_pool2d(data=conv,
                                      pool_size=(2, 2),
                                      strides=(2, 2),
                                      layout='NHWC')
        return pool

    def dense_block(datas, filters, layers):
        layers_concat = []
        layers_concat.append(datas)
        b_l = bottleneck_layer(datas, filters)

        layers_concat.append(b_l)
        for i in range(layers - 1):
            x = nnvm.symbol.concatenate(*layers_concat, axis=3)
            x = bottleneck_layer(x, filters)
            layers_concat.append(x)
        return x

    def get_symbol(datas, num_classes=16):
        x = Conv(datas, kernel_size=(7, 7), filter_nums=32, stride=(2, 2))

        x = nnvm.symbol.max_pool2d(x,
                                   pool_size=(3, 3),
                                   strides=(2, 2),
                                   layout='NHWC')

        b1 = dense_block(x, 32, 6)

        l1 = transition_layer(b1, 32)

        b2 = dense_block(l1, 32, 12)
        l2 = transition_layer(b2, 32)
        b3 = dense_block(l2, 32, 48)
        l3 = transition_layer(b3, 32)
        b4 = dense_block(l3, 32, 32)
        x = nnvm.symbol.global_avg_pool2d(data=b4, layout='NHWC')
        x = nnvm.symbol.flatten(data=x)
        fc = nnvm.symbol.dense(data=x, units=16)
        symbol = nnvm.symbol.softmax(data=fc)
        return symbol

    input_shape = (1, 229, 229, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.random(size=input_shape)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        module.run()

        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer().mean)
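dense_block grows its concatenation linearly: each bottleneck_layer emits `filters` channels and is appended to layers_concat, so the i-th concatenate sees in_channels + (i + 1) * filters channels while the block itself returns only the last bottleneck's output. A small sketch of that channel arithmetic (the 32-channel input for b1 comes from the stem Conv above):

def dense_block_concat_channels(in_channels, filters, layers):
    # layers_concat starts as [input, bottleneck_0], i.e. in_channels + filters channels.
    channels = in_channels + filters
    widths = []
    for _ in range(layers - 1):
        widths.append(channels)   # channel count seen by this iteration's concatenate
        channels += filters       # the new bottleneck output is appended to the list
    return widths

# e.g. dense_block(x, 32, 6) with a 32-channel input concatenates
# 64, 96, 128, 160, 192 channels across its five concatenate calls,
# while the block returns the last bottleneck's 32 channels.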
Example #13
def test_Alexnet():
    def Conv(data, kernel_size, filter_nums, stride=(1, 1), pad=(0, 0)):
        if pad[0] != 0 or pad[1] != 0:
            data = nnvm.symbol.pad(data=data,
                                   pad_width=((0, 0), (pad[0], pad[0]),
                                              (pad[1], pad[1]), (0, 0)))
        datas = nnvm.symbol.conv2d(data=data,
                                   kernel_size=kernel_size,
                                   channels=filter_nums,
                                   strides=stride,
                                   layout='NHWC',
                                   kernel_layout='HWOI')
        datas = nnvm.symbol.relu(data=datas)
        return datas

    def get_symbol(datas, num_classes):
        conv1 = Conv(data=datas,
                     kernel_size=(11, 11),
                     filter_nums=96,
                     stride=(4, 4))
        pool1 = nnvm.symbol.max_pool2d(data=conv1,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        conv2 = Conv(data=pool1,
                     kernel_size=(5, 5),
                     filter_nums=256,
                     pad=(2, 2))
        pool2 = nnvm.symbol.max_pool2d(data=conv2,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        conv3 = Conv(data=pool2,
                     kernel_size=(3, 3),
                     filter_nums=384,
                     pad=(1, 1))
        conv4 = Conv(data=conv3,
                     kernel_size=(3, 3),
                     filter_nums=384,
                     pad=(1, 1))
        conv5 = Conv(data=conv4,
                     kernel_size=(3, 3),
                     filter_nums=256,
                     pad=(1, 1))
        pool3 = nnvm.symbol.max_pool2d(data=conv5,
                                       pool_size=(3, 3),
                                       strides=(2, 2),
                                       layout='NHWC')
        datas = nnvm.symbol.flatten(data=pool3)
        fc1 = nnvm.symbol.dense(data=datas, units=1024)
        relu1 = nnvm.symbol.relu(data=fc1)
        drop1 = nnvm.symbol.dropout(data=relu1, rate=0.5)
        fc2 = nnvm.symbol.dense(data=drop1, units=1024)
        relu2 = nnvm.symbol.relu(data=fc2)
        drop2 = nnvm.symbol.dropout(data=relu2, rate=0.5)
        fc3 = nnvm.symbol.dense(data=drop2, units=16)
        symbol = nnvm.symbol.softmax(fc3)
        return symbol

    input_shape = (1, 128, 128, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 1
    z = get_symbol(datas=data, num_classes=16)
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":
            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='SC')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.randint(size=input_shape, low=-32, high=32).astype(np.float32)  # cast: the graph expects float32
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)
        ftimer_result = ftimer()  # executes "run"; module.run() is otherwise never called before reading the output
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer_result.mean * 10)
Example #14
def test_resnets():
    def residual_unit(data,
                      num_filter,
                      stride,
                      dim_match,
                      name,
                      bottle_neck=True):
        # bottle_neck : False
        """
        Return Resnet Unit symbol for building Resnet
        Parameters
        ------------
        data       :  str
                    Input data
        
        num_filter :  int
                    Number of output channels
        
        stride     :  tuple
                    Stride used in convolution
                
        dim_match  :  Boolean
                    True means channel number between input and output is the same
                    otherwise means differ
        """
        if bottle_neck:
            bn1 = nnvm.symbol.batch_norm(data=data,
                                         axis=3,
                                         epsilon=2e-5,
                                         name=name + '_bn1')
            act1 = nnvm.symbol.relu(data=bn1, name=name + '_relu1')
            conv1 = nnvm.symbol.conv2d(data=act1,
                                       channels=int(num_filter * 0.25),
                                       kernel_size=(1, 1),
                                       strides=stride,
                                       padding=(0, 0),
                                       use_bias=False,
                                       layout='NHWC',
                                       kernel_layout='HWOI',
                                       name=name + '_conv1')
            bn2 = nnvm.symbol.batch_norm(data=conv1,
                                         axis=3,
                                         epsilon=2e-5,
                                         name=name + '_bn2')
            act2 = nnvm.symbol.relu(data=bn2, name=name + '_relu2')

            pad = nnvm.symbol.pad(data=act2,
                                  pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv2 = nnvm.symbol.conv2d(data=pad,
                                       channels=int(num_filter * 0.25),
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       padding=(0, 0),
                                       use_bias=False,
                                       layout='NHWC',
                                       kernel_layout='HWOI',
                                       name=name + '_conv2')

            bn3 = nnvm.symbol.batch_norm(data=conv2,
                                         axis=3,
                                         epsilon=2e-5,
                                         name=name + '_bn3')
            act3 = nnvm.symbol.relu(data=bn3, name=name + '_relu3')
            conv3 = nnvm.symbol.conv2d(data=act3,
                                       channels=num_filter,
                                       kernel_size=(1, 1),
                                       strides=(1, 1),
                                       padding=(0, 0),
                                       use_bias=False,
                                       layout='NHWC',
                                       kernel_layout='HWOI',
                                       name=name + '_conv3')

            if dim_match:
                shortcut = data
            else:
                shortcut = nnvm.symbol.conv2d(data=act1,
                                              channels=num_filter,
                                              kernel_size=(1, 1),
                                              strides=stride,
                                              use_bias=False,
                                              layout='NHWC',
                                              kernel_layout='HWOI',
                                              name=name + '_sc')
            return nnvm.symbol.elemwise_add(conv3, shortcut)
        else:
            # bottle_neck = False
            # i = 0 : filter_list[1] = 64, (1, 1), False
            # i = 1 : filter_list[2] = 128, (2, 2), False
            # i = 2 : filter_list[3] = 256, (2, 2), False
            # i = 3 : filter_list[4] = 512, (2, 2), False
            # bn1 = nnvm.symbol.batch_norm(data = data, axis = 3, epsilon = 2e-5, name = name + '_bn1')
            act1 = nnvm.symbol.relu(data=data, name=name + '_relu1')
            # (56, 56, 64)
            # num_filter = filter_list[1] = 64
            # strides = (1, 1)
            pad1 = nnvm.symbol.pad(data=act1,
                                   pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv1 = nnvm.symbol.conv2d(data=pad1,
                                       channels=num_filter,
                                       kernel_size=(3, 3),
                                       strides=stride,
                                       padding=(0, 0),
                                       use_bias=False,
                                       layout='NHWC',
                                       kernel_layout='HWOI',
                                       name=name + '_conv1')  # was '_bn2', a copy-paste slip
            # i = 0 : (56, 56, 64)
            # i = 1 : (28, 28, 128)
            # i = 2 : (14, 14, 256)
            # i = 3 : (7, 7, 512)
            # bn2 = nnvm.symbol.batch_norm(data = conv1, axis = 3, epsilon = 2e-5, name = name + '_bn2')
            act2 = nnvm.symbol.relu(data=conv1, name=name + '_relu2')
            # (56, 56, 64)
            pad2 = nnvm.symbol.pad(data=act2,
                                   pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            conv2 = nnvm.symbol.conv2d(data=pad2,
                                       channels=num_filter,
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       padding=(0, 0),
                                       use_bias=False,
                                       layout='NHWC',
                                       kernel_layout='HWOI',
                                       name=name + '_conv2')
            # i = 0 : (56, 56, 64)
            # i = 1 : (28, 28, 128)
            # i = 2 : (14, 14, 256)
            if dim_match:
                shortcut = data
            else:
                shortcut = nnvm.symbol.conv2d(data=act1,
                                              channels=num_filter,
                                              kernel_size=(1, 1),
                                              strides=stride,
                                              use_bias=False,
                                              layout='NHWC',
                                              kernel_layout='HWOI',
                                              name=name + '_sc')
            return nnvm.symbol.elemwise_add(conv2, shortcut)

    def resnet(datas,
               units,
               num_stages,
               filter_list,
               num_classes,
               image_shape,
               bottle_neck=True):
        # units = [2, 2, 2, 2]
        # num_stages = 4
        # filter_list = [64, 64, 128, 256, 512]
        # num_classes = 1000
        # image_shape = (1, 224, 224, 16)
        # bottle_neck = False
        """
        Return Resnet symbol of
        Parameters
        ------------
        units       : list
                        Number of units in each stage
        
        num_stage   : int
                        Number of stage
                        
        filter_list : list
                        Channel size of each stage
        
        num_classes : int
                        Output size of symbol
        """
        num_unit = len(units)
        assert num_unit == num_stages

        data = nnvm.symbol.batch_norm(data=datas,
                                      axis=3,
                                      epsilon=2e-5,
                                      scale=False,
                                      name="bn_data")

        (_, height, _, _) = image_shape
        if height <= 32:
            pad = nnvm.symbol.pad(data=data,
                                  pad_width=((0, 0), (1, 1), (1, 1), (0, 0)))
            body = nnvm.symbol.conv2d(data=pad,
                                      channels=filter_list[0],
                                      kernel_size=(3, 3),
                                      strides=(1, 1),
                                      padding=(1, 1),
                                      use_bias=False,
                                      layout='NHWC',
                                      kernel_layout='HWOI',
                                      name="conv0")
        else:
            pad = nnvm.symbol.pad(data=data,
                                  pad_width=((0, 0), (3, 3), (3, 3), (0, 0)))

            body = nnvm.symbol.conv2d(data=pad,
                                      channels=filter_list[0],
                                      kernel_size=(7, 7),
                                      strides=(2, 2),
                                      padding=(0, 0),
                                      use_bias=False,
                                      layout='NHWC',
                                      kernel_layout='HWOI',
                                      name="conv0")
            # body.shape = (112, 112, 64)
            # body = nnvm.symbol.batch_norm(data = body, axis = 3, epsilon = 2e-5, name = "bn0")
            body = nnvm.symbol.relu(data=body, name="relu0")
            # body = nnvm.symbol.pad(data = body, pad_width = ((0, 0), (1, 1), (1, 1), (0, 0)))
            body = nnvm.symbol.max_pool2d(data=body,
                                          pool_size=(3, 3),
                                          strides=(2, 2),
                                          layout='NHWC')

            # body.shape = (56, 56, 64)
        for i in range(num_stages):
            # num_stages == 4
            # i = 0: (56, 56, 64)
            # i = 1: filter_list[2] = 128, (2, 2), False (28, 28, 128)
            # i = 2: filter_list[3] = 256, (2, 2), False
            # i = 3: filter_list[4] = 512, (2, 2), False

            body = residual_unit(body,
                                 filter_list[i + 1],
                                 (1 if i == 0 else 2, 1 if i == 0 else 2),
                                 False,
                                 name='stage%d_unit%d' % (i + 1, 1),
                                 bottle_neck=bottle_neck)
            # (56, 56, 64)
            # units[0] - 1 = 1
            for j in range(units[i] - 1):
                body = residual_unit(body,
                                     filter_list[i + 1], (1, 1),
                                     True,
                                     name="stage%d_unit%d" % (i + 1, j + 2),
                                     bottle_neck=bottle_neck)
                # (56, 56, 64)
        # (7, 7, 512)
        # bn1 = nnvm.symbol.batch_norm(data = body, axis = 3, epsilon = 2e-5, name = "bn1")
        relu1 = nnvm.symbol.relu(data=body, name="relu1")
        pool1 = nnvm.symbol.global_avg_pool2d(data=relu1,
                                              layout='NHWC',
                                              name="pool1")
        # (1, 1, 512)
        flat = nnvm.symbol.flatten(data=pool1)
        # (512)
        fc1 = nnvm.symbol.dense(data=flat, units=num_classes, name='fc1')

        return nnvm.symbol.softmax(data=fc1, name='softmax')

    def get_symbol(datas,
                   num_classes,
                   num_layers=50,
                   image_shape=(1, 224, 224, 16),
                   **kwargs):
        (_, height, _, _) = image_shape
        if height <= 28:
            num_stages = 3
            if (num_layers - 2) % 9 == 0 and num_layers >= 164:
                per_unit = [(num_layers - 2) // 9]
                filter_list = [16, 64, 128, 256]
                bottle_neck = True
            elif (num_layers - 2) % 6 == 0 and num_layers < 164:
                per_unit = [(num_layers - 2) // 6]
                filter_list = [16, 16, 32, 64]
                bottle_neck = False
            else:
                raise ValueError(
                    "no experiments done on num_layers {}".format(num_layers))
            units = per_unit * num_stages
        else:
            print("height = 224 > 28")
            # height = 224 > 28
            if num_layers >= 50:
                filter_list = [64, 256, 512, 1024, 2048]

                bottle_neck = True
            else:
                print("num_layers = 18 < 50")
                # num_layers = 18 < 50
                filter_list = [64, 64, 128, 256, 512]
                bottle_neck = False
            num_stages = 4
            if num_layers == 18:
                units = [2, 2, 2, 2]
            elif num_layers == 34:
                units = [3, 4, 6, 3]
            elif num_layers == 50:
                units = [3, 4, 6, 3]
            elif num_layers == 101:
                units = [3, 4, 23, 3]
            elif num_layers == 152:
                units = [3, 8, 36, 3]
            elif num_layers == 200:
                units = [3, 24, 36, 3]
            elif num_layers == 269:
                units = [3, 30, 48, 8]
            else:
                raise ValueError(
                    "no experiments done on num_layers {}".format(num_layers))
        return resnet(datas=datas,
                      units=units,
                      num_stages=num_stages,
                      filter_list=filter_list,
                      num_classes=num_classes,
                      image_shape=image_shape,
                      bottle_neck=bottle_neck)

    input_shape = (1, 224, 224, 16)
    target_host = "llvm"
    device = "nnpu"
    data = nnvm.symbol.Variable(name="data")
    target = tvm.target.create("llvm -device={}".format(device))
    print("ok")
    num_runs = 3
    z = get_symbol(datas=data,
                   num_classes=16,
                   num_layers=18,
                   image_shape=(1, 224, 224, 16))
    compute_graph = nnvm.graph.create(z)
    print(compute_graph.ir())
    with nnvm.compiler.build_config(opt_level=0):
        if target.device_name != "nnpu":

            deploy_graph, lib, params = nnvm.compiler.build(
                compute_graph,
                target,
                shape={"data": input_shape},
                dtype="float32",
                target_host=target_host)
        else:
            with ScheduleProcHelper():
                with nnpu.build_config():
                    nnpu.set_device(nnpu.get_env(), type='S0')
                    deploy_graph, lib, params = nnvm.compiler.build(
                        compute_graph,
                        target,
                        shape={"data": input_shape},
                        dtype="float32",
                        target_host=target_host)
        print(deploy_graph.ir())
        ctx = tvm.context(str("nnpu"), 0) if device == "nnpu" else tvm.context(
            str("llvm"), 0)
        module = runtime.create(deploy_graph, lib, ctx)
        a_np = np.random.uniform(size=(1, 224, 224, 16), low=-32,
                                 high=32).astype(np.float32)
        print(a_np)
        module.set_input(data=a_np)
        ftimer = module.module.time_evaluator("run",
                                              ctx,
                                              number=num_runs,
                                              repeat=1)

        module.run()
        out = module.get_output(0, out=tvm.nd.empty((1, 16)))
        print(out.asnumpy())
        print(deploy_graph.ir())
        print(ftimer().mean * 10)