Example #1
    def test_relu(self):
        c = Config(None, None, 4, 4)

        io_in = InOut("in", "static", np.array([1, 2, 3, 4]), (4,))
        io_out = InOut("out", "dynamic", None, (4,))

        am = {"out": np.ndarray((4,))}
        inp = {"X": io_in}
        oup = {"Y": io_out}

        n = Node(0, ops.RELU, inp, oup, {}, 0)
        fn = kernels.relu_cpu(n, am, c)

        # eval on the initial input
        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [1, 2, 3, 4])

        # copy new static input in and re-run the same kernel closure
        np.copyto(io_in.data, [-2, 2, -1, 1])
        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [0, 2, 0, 1])

        np.copyto(io_in.data, [-2, -2, -1, -100000])
        fn()
        np.testing.assert_array_equal(io_out.get_data(am), [0, 0, 0, 0])
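
The re-run pattern above works because the function returned by kernels.relu_cpu is a closure: each call re-reads the current contents of io_in.data and rewrites the output buffer in am. A minimal standalone sketch of the pattern (make_relu and its arguments are hypothetical, not the project's API):

import numpy as np

def make_relu(get_input, out_buffer):
    # return a closure that recomputes ReLU from the current input
    def fn():
        np.maximum(get_input(), 0, out=out_buffer)
    return fn

x = np.array([-2.0, 2.0, -1.0, 1.0])
y = np.empty_like(x)
relu = make_relu(lambda: x, y)
relu()
assert (y == [0.0, 2.0, 0.0, 1.0]).all()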
Example #2
    def test_copy(self):
        c = Config(None, None, 4, 4)

        size = (4, 3, 224, 224)

        io_in = InOut("in", "static", np.random.random(size), size)
        io_gpu = InOut("gpu", "dynamic", None, size)
        io_return = InOut("return", "dynamic", None, size)

        with cupy.cuda.Device(0):
            gpu_buffer = cupy.ndarray(size)

        am = {"gpu": gpu_buffer,
              "return": np.ndarray(size)}

        inp_c0 = {"X": io_in}
        oup_c0 = {"Z": io_gpu}

        inp_c1 = {"X": io_gpu}
        oup_c1 = {"Z": io_return}

        c0 = Node(0, ops.O2P_COPY, inp_c0, oup_c0, {}, 0)
        c0.device_id = 0
        c1 = Node(0, ops.O2P_COPY, inp_c1, oup_c1, {}, 0)
        c1.device_id = 0

        fn_c0 = kernels.copy(c0, am, c)
        fn_c1 = kernels.copy(c1, am, c)

        # copy host -> GPU
        fn_c0()

        # execute +1 on the device buffer
        cupy.copyto(gpu_buffer, gpu_buffer + 1)

        # copy GPU -> host
        fn_c1()

        ref_plus_one = io_in.get_data(am) + 1

        cupy.testing.assert_array_equal(io_gpu.get_data(am), ref_plus_one)
        np.testing.assert_equal(io_return.get_data(am), ref_plus_one)
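
For reference, the two O2P_COPY nodes move data the same way a plain CuPy round trip does; a standalone sketch (not the project's kernels.copy, which presumably dispatches on each node's device assignment):

import numpy as np
import cupy

host = np.random.random((4, 3, 224, 224))

with cupy.cuda.Device(0):
    dev = cupy.asarray(host)  # host -> device copy
    dev = dev + 1             # compute on the GPU

back = cupy.asnumpy(dev)      # device -> host copy
np.testing.assert_array_equal(back, host + 1)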
Example #3
    def test_conv_stride(self):
        c = Config(None, None, 4, 4)

        io_in = InOut("in", "static", np.ndarray((4, 3, 22, 22)), (4, 3, 22, 22))
        io_kern = InOut("kern", "static", np.ndarray((1, 3, 3, 3)), (1, 3, 3, 3))
        io_bias = InOut("bias", "static", np.ndarray((1,)), (1,))
        io_out = InOut("out", "dynamic", None, (4, 1, 10, 10))

        i = np.random.random(np.shape(io_in.data))
        w = np.random.random(np.shape(io_kern.data))
        b = np.random.random(np.shape(io_bias.data))

        np.copyto(io_in.data, i)
        np.copyto(io_kern.data, w)
        np.copyto(io_bias.data, b)

        # X, W, B with default attrs, non-default stride
        am = {"out": np.ndarray((4, 1, 10, 10))}
        inp = {"X": io_in, "W": io_kern, "B": io_bias}
        oup = {"Y": io_out}
        attrs = {
            "dilations": (1, 1),
            "group": 1,
            "kernel_shape": (3, 3),
            "pads": (0, 0, 0, 0),
            "strides": (2, 2, 2, 2),
        }

        n = Node(0, ops.CONV, inp, oup, attrs, 0)
        fn = kernels.conv_cpu(n, am, c)

        # chainer reference with the same configuration
        o = chainer.functions.convolution_2d(
            i, w, b=b, stride=(2, 2), pad=(0, 0), dilate=(1, 1), groups=1
        ).array
        fn()

        np.testing.assert_array_almost_equal(o, io_out.get_data(am))
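
The (4, 1, 10, 10) output shape follows from the standard convolution size formula out = floor((H + 2*pad - dilation*(k - 1) - 1) / stride) + 1; a quick check:

def conv_out_dim(size, kernel, stride, pad=0, dilation=1):
    # standard convolution output-size formula (floor division)
    return (size + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1

# 22x22 input, 3x3 kernel, stride 2, no padding -> 10x10
assert conv_out_dim(22, 3, 2) == 10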
Example #4
    def test_maxpool_big_stride(self):

        B = 4
        C = 4
        H = 22
        W = 22

        K_size = (3, 3)

        in_shape = (B, C, H, W)
        out_shape = (B, C, 7, 7)

        c = Config(None, None, B, B)

        io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
        io_out = InOut("out", "dynamic", None, out_shape)

        i = np.random.random(np.shape(io_in.data))

        np.copyto(io_in.data, i)

        ref_mod = torch.nn.MaxPool2d(
            K_size, stride=3, dilation=1, padding=0, ceil_mode=False
        )

        torch_i = torch.tensor(i)
        ref = ref_mod(torch_i).numpy()

        am = {"out": np.ndarray(out_shape)}
        inp = {"X": io_in}
        oup = {"Y": io_out}
        attrs = {"kernel_shape": K_size, "strides": (3, 3, 3, 3)}

        n = Node(0, ops.MAXPOOL, inp, oup, attrs, 0)

        test_fn = kernels.maxpool_cpu(n, am, c)

        test_fn()

        np.testing.assert_array_almost_equal(io_out.get_data(am), ref)
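
The (B, C, 7, 7) output shape comes from the same size formula with ceil_mode=False:

# 22x22 input, 3x3 window, stride 3, no padding -> floor((22 - 3) / 3) + 1 = 7
assert (22 - 3) // 3 + 1 == 7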
Example #5
File: backend.py Project: benghaem/MutNN
def opt_graph_split(graph: nx.DiGraph, alloc_map: Dict[str, np.ndarray],
                    config: Config) -> None:

    # add the generic head node to the graph
    # connect it to the initial root node generated by frontend
    graph.add_node(PNO_GRAPH_HEAD_ID)
    graph.add_edge(PNO_GRAPH_HEAD_ID, 0)

    graph.nodes[PNO_GRAPH_HEAD_ID]["node"] = Node(-1, ops.O2P_GRAPH_HEAD, {},
                                                  {}, {}, 0)

    # need to rename and assign to the correct device
    cuda_devices = get_valid_cuda_devices()
    num_cuda = len(cuda_devices)

    config.computed_batch_size = num_cuda * config.user_width
    # there is now +1 node in the graph because of the -1 head
    new_gnode = graph.number_of_nodes() - 1

    if num_cuda > 0:

        # compute the correct split

        # one rename map per device: source_gnode -> local_gnode
        # (note: [{}] * num_cuda would alias a single shared dict)
        gpu_name_maps = [{} for i in range(num_cuda)]

        # map the original graph head to itself for every device
        for i in range(num_cuda):
            gpu_name_maps[i][PNO_GRAPH_HEAD_ID] = PNO_GRAPH_HEAD_ID

        # start at the initial node of the non-replicated graph
        fixed_list = list(nx.topological_sort(graph))

        # skip the HEAD node
        for source_gnode in fixed_list[1:]:
            source_node = graph.nodes[source_gnode]["node"]

            gparents = list(graph.predecessors(source_gnode))

            for gpu_idx, device_id in enumerate(cuda_devices):

                device_node = build_replicated_node(source_node, new_gnode,
                                                    gpu_idx, f"_g{device_id}")

                # configure device settings for the new node
                if source_node.device_type == "gpu":
                    device_node.device_type = "gpu"
                    device_node.device_id = device_id
                else:
                    device_node.device_type = "cpu"
                    device_node.device_id = 0

                graph.add_node(new_gnode)
                graph.nodes[new_gnode]["node"] = device_node

                # look up source node parent in gpu_name_maps
                for gparent in gparents:
                    edge_source = gpu_name_maps[gpu_idx][gparent]
                    graph.add_edge(edge_source, new_gnode)

                # record this node in this device's name map
                gpu_name_maps[gpu_idx][source_gnode] = new_gnode

                new_gnode += 1

        # remove the original, non-replicated graph
        for gnode in fixed_list[1:]:
            graph.remove_node(gnode)

    return
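
The list comprehension kept in opt_graph_split matters: [{}] * num_cuda would build a list of references to one shared dict, so every device would write into the same name map. A standalone demonstration:

aliased = [{}] * 2
aliased[0]["head"] = 0
assert aliased[1] == {"head": 0}  # both slots are the same dict

independent = [{} for _ in range(2)]
independent[0]["head"] = 0
assert independent[1] == {}       # one dict per device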
Example #6
    return new_node


def build_load_node(target, io_map, usage_map, node_id):
    """
    Build a new load node and record its definition in the usage map
    """

    inputs = {}
    outputs = {"Z": io_map[target]}
    attrs = {"batch_id": 0}
    new_node = Node(node_id, ops.O2P_LOAD, inputs, outputs, attrs, 0)
    usage_map[target]["def"].append(node_id)

    return new_node
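
build_load_node assumes each usage_map entry already carries a "def" list. A hypothetical shape for the two maps, inferred only from this snippet (the "use" key is an assumption):

# hypothetical structure implied by usage_map[target]["def"].append(node_id)
usage_map = {"input_0": {"def": [], "use": []}}
io_map = {"input_0": io_in}  # io_in: an InOut, as in the tests above

node = build_load_node("input_0", io_map, usage_map, node_id=7)
assert usage_map["input_0"]["def"] == [7]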


if __name__ == "__main__":
    logging.basicConfig(filename="onnx_frontend.log", level=logging.DEBUG)

    config = Config(None, None, 4, 4)

    g = from_onnx("../example.onnx", config)

    nx.write_gml(g, "frontend.gml", str)

    print("--------")
    for gnode in g.nodes:
        g.nodes[gnode]["node"].pretty_print()
Example #7

class Model:
    def __init__(self, graph, config):
        self.graph = graph
        self.config = config

    def run(self):
        ptasks.spawn(placement=pcpu_cores.cpu(0))(backend.build_execute(
            self.graph, self.config))


if __name__ == "__main__":
    config = Config(
        vision_dataloaders.echo_top5,
        vision_dataloaders.get_test,
        int(sys.argv[2]),
        int(sys.argv[3]),
    )
    config.debug_passes = True
    config.use_simple_model_para = True
    config.use_data_para = False

    o2p_model = build(sys.argv[1], config)

    st = datetime.datetime.now()
    o2p_model.run()
    end = datetime.datetime.now()

    time = end - st

    with open(sys.argv[4], "a+") as f:
Example #8
    def test_batchnorm_defaults(self):

        B = 4
        C = 4
        H = 22
        W = 22

        K_size = (3, 3)

        in_shape = (B, C, H, W)
        out_shape = (B, C, H, W)

        c = Config(None, None, B, B)

        io_in = InOut("in", "static", np.ndarray(in_shape), in_shape)
        io_scale = InOut("scale", "static", np.ndarray((C,)), (C,))
        io_B = InOut("B", "static", np.ndarray((C,)), (C,))
        io_mean = InOut("mean", "static", np.ndarray((C,)), (C,))
        io_var = InOut("var", "static", np.ndarray((C,)), (C,))
        io_out = InOut("out", "dynamic", None, out_shape)

        np.random.seed(123)

        i = np.random.random(np.shape(io_in.data))
        s = np.random.random(np.shape(io_scale.data))
        b = np.random.random(np.shape(io_B.data))
        mean = np.random.random(np.shape(io_mean.data))
        var = np.random.random(np.shape(io_var.data))

        np.copyto(io_in.data, i)
        np.copyto(io_scale.data, s)
        np.copyto(io_B.data, b)
        np.copyto(io_mean.data, mean)
        np.copyto(io_var.data, var)

        eps = 1e-05
        momentum_torch = 0.5
        momentum_test = 0.4

        torch_i = torch.tensor(i)
        torch_w = torch.tensor(s)
        torch_b = torch.tensor(b)
        torch_mean = torch.tensor(mean)
        torch_var = torch.tensor(var)

        ref = torch.nn.functional.batch_norm(
            torch_i,
            torch_mean,
            torch_var,
            weight=torch_w,
            bias=torch_b,
            training=False,
            momentum=momentum_torch,
            eps=eps,
        ).numpy()

        ref_chainer = chainer.functions.fixed_batch_normalization(
            i,
            s,
            b,
            mean,
            var,
            eps=eps,
        ).array

        am = {"out": np.ndarray(out_shape)}
        inp = {"X": io_in, "scale": io_scale, "B": io_B, "mean": io_mean, "var": io_var}
        oup = {"Y": io_out}
        attrs = {"epsilon": eps, "momentum": momentum_test}

        n = Node(0, ops.BATCH_NORM, inp, oup, attrs, 0)

        test_fn = kernels.batchnorm_cpu(n, am, c)

        test_fn()

        #np.testing.assert_array_almost_equal(ref, ref_chainer)
        np.testing.assert_array_almost_equal(io_out.get_data(am), ref_chainer)
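
In inference mode both references compute the same per-channel affine transform, y = scale * (x - mean) / sqrt(var + eps) + B, broadcast over the channel axis of the NCHW tensor. A plain-NumPy statement of that formula (a sketch, not the project's batchnorm_cpu):

def batchnorm_ref(x, scale, bias, mean, var, eps=1e-5):
    # reshape per-channel stats to broadcast over (N, C, H, W)
    shape = (1, -1, 1, 1)
    return (x - mean.reshape(shape)) / np.sqrt(var.reshape(shape) + eps) \
        * scale.reshape(shape) + bias.reshape(shape)

Applied to the test's i, s, b, mean, and var, this should match ref_chainer to the asserted precision.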