예제 #1
0
 def worker(rank, data, backend, expect, port_queue):
     """Per-rank worker: broadcast a parameter and check every rank sees it.

     Silently no-ops on machines without CUDA. `world_size` and
     `_init_process_group_wrapper` come from the enclosing test module.
     """
     if not mge.is_cuda_available():
         return
     # rank is reused as the device index (world_size, rank, dev=rank).
     _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
     inp = Parameter(data)
     dist.functional.bcast_param(inp, "x")
     # After the broadcast every rank should hold the expected value.
     assert np.allclose(inp.numpy(), expect)
예제 #2
0
 def worker():
     """Verify a second mm-server cannot bind an already-used port."""
     # Nothing to test without CUDA.
     if not mge.is_cuda_available():
         return
     # Port 0 lets the OS pick a free port; success yields a positive port.
     first_port = mgb.config.create_mm_server("0.0.0.0", 0)
     assert first_port > 0
     # Binding the same port a second time must fail with -1.
     assert mgb.config.create_mm_server("0.0.0.0", first_port) == -1
예제 #3
0
 def worker(rank, data, backend, expect, port_queue):
     """Per-rank worker: all_gather this rank's tensor and check the result.

     Silently no-ops on machines without CUDA. `world_size` and
     `_init_process_group_wrapper` come from the enclosing test module.
     """
     if not mge.is_cuda_available():
         return
     # rank is reused as the device index (world_size, rank, dev=rank).
     _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
     inp = tensor(data)
     output = dist.functional.all_gather(inp, "x")
     # Every rank should receive the same concatenated result.
     assert np.allclose(output.numpy(), expect)
예제 #4
0
def test_correctness_use_adaptive_pooling():
    """MNIST correctness suite with adaptive pooling enabled.

    Trains/evaluates against a pre-dumped reference model; a CPU-generated
    reference is used when CUDA is not available.
    """
    if mge.is_cuda_available():
        model_name = "mnist_model_with_test.mge"
    else:
        model_name = "mnist_model_with_test_cpu.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    # Deterministic algorithm selection so numeric comparisons are stable.
    set_execution_strategy("HEURISTIC_REPRODUCIBLE")

    # Same three flag combinations as the original, run in the same order.
    for opt_a, opt_b in ((False, False), (True, False), (True, True)):
        run_train(model_path, opt_a, opt_b, max_err=1e-5, use_adaptive_pooling=True)

    # sublinear memory configuration
    config = SublinearMemoryConfig(genetic_nr_iter=10)
    run_train(
        model_path,
        True,
        True,
        sublinear_memory_config=config,
        max_err=1e-5,
        use_adaptive_pooling=True,
    )

    for opt_a in (False, True):
        run_eval(model_path, opt_a, max_err=1e-7, use_adaptive_pooling=True)
예제 #5
0
def test_tensor_serialization():
    """Round-trip Tensors/Parameters through mge.save / mge.load.

    Covers: dtype/device preservation, Parameter type preservation,
    map_location remapping (string and dict forms), and qparams survival.
    """
    with TemporaryFile() as f:
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="cpu0", dtype=np.int32)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        # BUG FIX: the original compared a.numpy() with data, which is
        # vacuous (a was built from data). The loaded tensor b is what
        # must round-trip correctly.
        np.testing.assert_equal(b.numpy(), data)
        assert b.device.logical_name == "cpu0:0"
        assert b.dtype == np.int32

    with TemporaryFile() as f:
        # Parameter subclass must survive serialization.
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # A plain Tensor must load back as exactly Tensor (not a subclass).
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # String map_location remaps the device on load.
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # Dict map_location remaps a specific device; needs a real GPU.
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)

    with TemporaryFile() as f:
        # Quantization parameters attached to a tensor must round-trip too.
        a = Tensor(0)
        a.qparams.scale = Tensor(1.0)
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f)
        assert isinstance(b.qparams.scale, Tensor)
        np.testing.assert_equal(b.qparams.scale.numpy(), 1.0)
예제 #6
0
 def worker(rank, backend, q):
     """Per-rank worker: sanity-check the distributed group's getters.

     Silently no-ops on machines without CUDA. `world_size`, `_LOCALHOST`
     and `_init_process_group_wrapper` come from the enclosing test module.
     """
     if not mge.is_cuda_available():
         return
     # rank is reused as the device index (world_size, rank, dev=rank).
     _init_process_group_wrapper(world_size, rank, rank, backend, q)
     # Every query must reflect the values the group was initialized with.
     assert dist.is_distributed() == True
     assert dist.get_master_ip() == _LOCALHOST
     assert dist.get_master_port() > 0
     assert dist.get_world_size() == world_size
     assert dist.get_rank() == rank
     assert dist.get_backend() == backend
예제 #7
0
def test_correctness():
    """MNIST correctness suite against a pre-dumped reference model."""
    # Fall back to a CPU-generated reference when CUDA is not available.
    model_name = (
        "mnist_model_with_test.mge"
        if mge.is_cuda_available()
        else "mnist_model_with_test_cpu.mge"
    )
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    # Deterministic conv algorithm so numeric comparisons are stable.
    set_conv_execution_strategy("HEURISTIC_REPRODUCIBLE")

    # Same three flag combinations, same order as before.
    for opt_a, opt_b in ((False, False), (True, False), (True, True)):
        run_test(model_path, opt_a, opt_b)
예제 #8
0
 def worker(rank, q):
     """Per-rank worker: verify group_barrier actually synchronizes ranks.

     Silently no-ops on machines without CUDA. `world_size`, `backend`,
     `_init_process_group_wrapper`, `_assert_q_empty` and `_assert_q_val`
     come from the enclosing test module.
     """
     if not mge.is_cuda_available():
         return
     _init_process_group_wrapper(world_size, rank, rank, backend, q)
     dist.group_barrier()
     if rank == 0:
         dist.group_barrier()
         q.put(0)  # to be observed in rank 1
     else:
         # Before rank 1 enters the second barrier, rank 0 cannot have
         # passed it, so the queue must still be empty.
         _assert_q_empty(q)  # q.put(0) is not executed in rank 0
         dist.group_barrier()
         _assert_q_val(q, 0)  # q.put(0) executed in rank 0
예제 #9
0
def test_tensor_serialization():
    """Round-trip Tensors/Parameters through pickle and mge.save/load."""

    # NOTE(review): tensor_eq is defined but never used below — either it
    # should replace the manual assert_equal calls (adding dtype/device
    # checks) or it can be deleted; confirm intent before changing.
    def tensor_eq(a, b):
        assert a.dtype == b.dtype
        assert a.device == b.device
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # Plain pickle round-trip preserves the values.
        data = np.random.randint(low=0, high=7, size=[233])
        a = Tensor(data, device="xpux", dtype=np.int32)
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # Parameter subclass must survive pickling.
        a = Parameter(np.random.random(size=(233, 2)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert isinstance(b, Parameter)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # A plain Tensor must load back as exactly Tensor (not a subclass).
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
        assert type(b) is Tensor
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # String map_location remaps the device on load.
        a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
        mge.save(a, f)
        f.seek(0)
        b = mge.load(f, map_location="cpux")
        assert type(b) is Tensor
        assert "cpu" in str(b.device)
        np.testing.assert_equal(a.numpy(), b.numpy())

    with TemporaryFile() as f:
        # Dict map_location remaps a specific device; needs a real GPU.
        if mge.is_cuda_available():
            device_org = mge.get_default_device()
            mge.set_default_device("gpu0")
            a = Tensor(np.random.random(size=(2, 233)).astype(np.float32))
            mge.save(a, f)
            f.seek(0)
            mge.set_default_device("cpux")
            b = mge.load(f, map_location={"gpu0": "cpu0"})
            assert type(b) is Tensor
            assert "cpu0" in str(b.device)
            np.testing.assert_equal(a.numpy(), b.numpy())
            mge.set_default_device(device_org)
예제 #10
0
    def worker(rank, data, yv_expect, running_mean, running_var):
        """Per-rank worker: run SyncBatchNorm over `steps` batches and
        compare the final output and running statistics with references.

        `nr_chan`, `momentum`, `eps`, `steps`, `tensor` and
        `assertTensorClose` come from the enclosing scope.
        """
        if not mge.is_cuda_available():
            return
        # Fixed 4-process group on localhost:2333; rank doubles as the
        # device index.
        dist.init_process_group("localhost", 2333, 4, rank, rank)
        bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
        data_tensor = tensor()
        for i in range(steps):
            data_tensor.set_value(data[i])
            yv = bn(data_tensor)

        # Only the last step's output is checked; running mean/var have
        # accumulated over all steps.
        assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
        assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
        assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
예제 #11
0
def test_warp_affine():
    """Apply one 2x3 affine matrix to a small tensor and compare against a
    precomputed reference (checked on CPU only)."""
    src = tensor(np.arange(27, dtype=np.float32).reshape((1, 3, 3, 3)))
    # Batch of one affine transform.
    mat = tensor(
        [[[1.26666667, 0.6, -83.33333333], [-0.33333333, 1, 66.66666667]]]
    )
    out = F.vision.warp_affine(src, mat, (2, 2), border_mode="wrap")
    expected = np.array(
        [
            [
                [[7.875, 8.875, 9.875], [8.90625, 9.90625, 10.90625]],
                [[18.75, 19.75, 20.75], [14.90625, 15.90625, 16.90625]],
            ]
        ],
        dtype=np.float32,
    )
    # Reference values were computed on CPU; skip the comparison on CUDA.
    if not is_cuda_available():
        np.testing.assert_almost_equal(out.numpy(), expected, 5)
예제 #12
0
def worker(master_ip, master_port, world_size, rank, dev, trace):
    """Distributed training worker: train a tiny MLP for 5 steps.

    Args:
        master_ip: address of the master process.
        master_port: port of the master process.
        world_size: number of processes in the group.
        rank: this process's rank.
        dev: device index for this process.
        trace: whether jit tracing is enabled.

    Silently no-ops on machines without CUDA.
    """
    import megengine.distributed as dist
    import megengine.functional as F
    from megengine import is_cuda_available
    from megengine import jit
    from megengine.module import Linear, Module
    from megengine.optimizer import SGD

    if not is_cuda_available():
        return

    class MLP(Module):
        # Two-layer perceptron over flattened 3x224x224 inputs.
        def __init__(self):
            super().__init__()
            self.fc0 = Linear(3 * 224 * 224, 500)
            self.fc1 = Linear(500, 10)

        def forward(self, x):
            x = self.fc0(x)
            x = F.relu(x)
            x = self.fc1(x)
            return x

    # BUG FIX: the original hard-coded master_port=3456, silently ignoring
    # the caller-supplied `master_port` argument.
    dist.init_process_group(master_ip=master_ip,
                            master_port=master_port,
                            world_size=world_size,
                            rank=rank,
                            dev=dev)
    net = MLP()

    opt = SGD(net.parameters(requires_grad=True), lr=0.02)

    # Random data/labels: this exercises the training loop, not accuracy.
    data = np.random.random((64, 3 * 224 * 224)).astype(np.float32)
    label = np.random.randint(0, 10, size=(64, )).astype(np.int32)

    # Toggle tracing globally before the traced function is first called.
    jit.trace.enabled = trace

    @jit.trace()
    def train_func(data, label):
        pred = net(data)
        loss = F.cross_entropy_with_softmax(pred, label)
        opt.backward(loss)
        return loss

    for i in range(5):
        opt.zero_grad()
        loss = train_func(data, label)
        opt.step()
예제 #13
0
 def worker(rank, q):
     if not mge.is_cuda_available():
         return
     _init_process_group_wrapper(world_size, rank, rank, backend, q)
     dist.group_barrier()
     if rank == 0:
         func(0, q)  # q.put(0)
         q.put(2)
     else:
         _assert_q_val(q, 0)  # func executed in rank 0
         _assert_q_empty(q)  # q.put(2) is not executed
         func(1, q)
         _assert_q_val(
             q, 1
         )  # func in rank 1 executed earlier than q.put(2) in rank 0
         _assert_q_val(q, 2)  # q.put(2) executed in rank 0
예제 #14
0
파일: resnet_prof.py 프로젝트: XL2013/Docs
import json

import numpy as np

import megengine as mge
import megengine.functional as F
import megengine.hub as hub
import megengine.optimizer as optim
from megengine.jit import trace

# This example requires a GPU.
assert mge.is_cuda_available(), "Please run with GPU"
# Load a resnet50 model from the megengine hub.
resnet = hub.load("megengine/models", "resnet50")

optimizer = optim.SGD(resnet.parameters(), lr=0.1,)

# profiling=True collects performance data during tracing.
@trace(symbolic=True, profiling=True)
def train_func(data, label, *, net, optimizer):
    pred = net(data)
    loss = F.cross_entropy_with_softmax(pred, label)
    optimizer.backward(loss)


resnet.train()
batch_size = 64

# Run 10 iterations and keep the profiling result of the last one.
# NOTE(review): the visible loop body only builds batch_data and never
# calls train_func — the snippet appears truncated here; confirm against
# the original resnet_prof.py.
for i in range(10):
    batch_data = np.random.randn(batch_size, 3, 224, 224).astype(np.float32)
예제 #15
0
def get_xpu_name():
    """Return the GPU name when CUDA is available, else the CPU name."""
    return get_gpu_name() if mge.is_cuda_available() else get_cpu_name()
예제 #16
0
파일: xornet.py 프로젝트: ztjryg4/MegEngine
def main():
    """Train XORNet on generated minibatches, evaluate on a fixed grid of
    points, and dump the traced inference model to disk."""

    # Fall back to CPU when no CUDA device is present.
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    opt = optim.SGD(net.parameters(requires_grad=True), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    # Pre-allocated tensors, refilled in-place each step via set_value.
    data = mge.tensor()
    label = mge.tensor(np.zeros((batch_size, )), dtype=np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data.set_value(minibatch["data"])
        label.set_value(minibatch["label"])
        opt.zero_grad()
        _, loss = train_fun(data, label, net=net, opt=opt)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            # NOTE(review): a validation minibatch is drawn but data/label
            # are never refilled with it, so val_fun runs on the current
            # TRAINING batch — looks like a bug; confirm intent.
            minibatch = next(val_dataset)
            _, loss = val_fun(data, label, net=net)
            loss = loss.numpy()[0]
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        opt.step()

    # Fixed evaluation points covering all four XOR quadrants.
    test_data = np.array([
        (0.5, 0.5),
        (0.3, 0.7),
        (0.1, 0.9),
        (-0.5, -0.5),
        (-0.3, -0.7),
        (-0.9, -0.1),
        (0.5, -0.5),
        (0.3, -0.7),
        (0.9, -0.1),
        (-0.5, 0.5),
        (-0.3, 0.7),
        (-0.1, 0.9),
    ])

    data.set_value(test_data)
    out = pred_fun(data, net=net)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)

    print("Test data")
    print(test_data)

    with np.printoptions(precision=4, suppress=True):
        print("Predicated probability:")
        print(pred_output)

    print("Predicated label")
    print(pred_label)

    model_name = "xornet_deploy.mge"

    # Dumping a deployable model requires pred_fun to have run traced.
    if pred_fun.enabled:
        print("Dump model as {}".format(model_name))
        pred_fun.dump(model_name, arg_names=["data"])
    else:
        print("pred_fun must be run with trace enabled in order to dump model")
예제 #17
0
    def run(
        N,
        IC,
        OC,
        IH,
        IW,
        KH,
        KW,
        PH,
        PW,
        SH,
        SW,
        has_bias=True,
        nonlinear_mode="IDENTITY",
    ):
        """Compare quantized conv_bias_activation against a float conv2d
        reference for one (N, IC, OC, ...) configuration.

        `inp_dtype`, `w_dtype`, `b_dtype` and `out_dtype` come from the
        enclosing scope. NOTE(review): the weight tensor is built with
        size=(OC, IC, KW, KW) — KH is unused; likely a typo for (OC, IC,
        KH, KW), confirm before changing.
        """
        inp_v = np.random.normal(size=(N, IC, IH, IW))
        w_v = np.random.normal(size=(OC, IC, KW, KW))
        b_v = np.random.normal(size=(1, OC, 1, 1))
        inp_scale = mgb.dtype.get_scale(inp_dtype)
        w_scale = mgb.dtype.get_scale(w_dtype)
        b_scale = mgb.dtype.get_scale(b_dtype)

        # Quantize the float data into the target integer dtypes.
        inpv = mgb.dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
        wv = mgb.dtype.convert_to_qint8(w_v * w_scale, w_dtype)
        bv = mgb.dtype.convert_to_qint32(b_v * b_scale, b_dtype)

        inp_int8 = tensor(inpv, dtype=inp_dtype)
        w_int8 = Parameter(wv, dtype=w_dtype)
        b_int32 = Parameter(bv, dtype=b_dtype)

        # Float copies feed the reference conv2d path.
        inp_fp32 = inp_int8.astype("float32")
        w_fp32 = w_int8.astype("float32")
        b_fp32 = b_int32.astype("float32")

        jit.trace.enabled = True
        b_symbolic = True

        def convert_to_nchw4(var):
            # Split channels into groups of 4 and move the group axis last:
            # NCHW -> NCHW4 layout.
            return var.reshape(var.shapeof(0),
                               var.shapeof(1) // 4, 4, var.shapeof(2),
                               var.shapeof(3)).dimshuffle(0, 1, 3, 4, 2)

        @jit.trace(symbolic=b_symbolic)
        def run_conv2d(inp, w, b):
            # Float reference path.
            O = F.conv2d(
                inp,
                w,
                b if has_bias else None,
                stride=(SH, SW),
                padding=(PH, PW),
            )
            if nonlinear_mode == "RELU":
                return F.relu(O)
            else:
                return O

        @jit.trace(symbolic=b_symbolic)
        def run_conv_bias(inp, w, b, format="NCHW"):
            # Quantized path under test.
            b = b if has_bias else np.zeros_like(b)
            if format == "NCHW4":
                inp = convert_to_nchw4(inp)
                w = convert_to_nchw4(w)
                b = F.flatten(b)
            return F.conv_bias_activation(
                inp,
                w,
                b,
                stride=(SH, SW),
                padding=(PH, PW),
                dtype=out_dtype,
                nonlinear_mode=nonlinear_mode,
            )

        # CUDA kernels here use the NCHW4 layout; CPU uses plain NCHW.
        format = "NCHW4" if is_cuda_available() else "NCHW"

        expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
        # Round-trip the reference through the quantized dtype so both
        # sides carry identical quantization error.
        expected = expected.astype(out_dtype).astype("float32")
        result = run_conv_bias(inp_int8, w_int8, b_int32,
                               format=format).astype("float32")
        if format == "NCHW4":
            # Undo the NCHW4 shuffle before comparing.
            result = result.dimshuffle(0, 1, 4, 2, 3)
        expected = F.flatten(expected)
        result = F.flatten(result)
        assertTensorClose(result.numpy(), expected.numpy())
예제 #18
0
    def run(
        N,
        IC,
        OC,
        IH,
        IW,
        KH,
        KW,
        PH,
        PW,
        SH,
        SW,
        has_bias=True,
        nonlinear_mode="identity",
    ):
        """Compare F.quantized.conv_bias_activation against a float conv2d
        reference for one (N, IC, OC, ...) configuration.

        `inp_dtype`, `w_dtype`, `b_dtype`, `out_dtype` and `outp_scale`
        come from the enclosing scope.
        """
        inp_v = np.random.normal(size=(N, IC, IH, IW))
        w_v = np.random.normal(size=(OC, IC, KH, KW))
        b_v = np.random.normal(size=(1, OC, 1, 1))
        inp_scale = dtype.get_scale(inp_dtype)
        w_scale = dtype.get_scale(w_dtype)
        b_scale = dtype.get_scale(b_dtype)

        # Quantize the float data into the target integer dtypes.
        inpv = dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
        wv = dtype.convert_to_qint8(w_v * w_scale, w_dtype)
        bv = dtype.convert_to_qint32(b_v * b_scale, b_dtype)

        inp_int8 = tensor(inpv, dtype=inp_dtype)
        w_int8 = Parameter(wv, dtype=w_dtype)
        b_int32 = Parameter(bv, dtype=b_dtype)

        # Float copies feed the reference conv2d path.
        inp_fp32 = inp_int8.astype("float32")
        w_fp32 = w_int8.astype("float32")
        b_fp32 = b_int32.astype("float32")

        def convert_to_nchw4(var):
            # Split channels into groups of 4 and move the group axis last:
            # NCHW -> NCHW4 layout.
            var = F.reshape(var, (var.shape[0], var.shape[1] // 4, 4,
                                  var.shape[2], var.shape[3]))
            var = F.transpose(var, (0, 1, 3, 4, 2))
            return var

        def run_conv2d(inp, w, b):
            # Float reference path.
            O = F.conv2d(
                inp,
                w,
                b if has_bias else None,
                stride=(SH, SW),
                padding=(PH, PW),
            )
            if nonlinear_mode == "relu":
                return F.relu(O)
            else:
                return O

        def run_conv_bias(inp, w, b, format="NCHW"):
            # Quantized path under test.
            b = b if has_bias else Parameter(np.zeros_like(b.numpy()))
            if format == "NCHW4":
                inp = convert_to_nchw4(inp)
                w = convert_to_nchw4(w)
                b = convert_to_nchw4(b)
            return F.quantized.conv_bias_activation(
                inp,
                w,
                b,
                stride=(SH, SW),
                padding=(PH, PW),
                dtype=out_dtype,
                nonlinear_mode=nonlinear_mode,
            )

        # CUDA kernels here use the NCHW4 layout; CPU uses plain NCHW.
        format = "NCHW4" if is_cuda_available() else "NCHW"

        expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
        # Round-trip the reference through the quantized dtype so both
        # sides carry identical quantization error.
        expected = expected.astype(out_dtype).astype("float32")
        result = run_conv_bias(inp_int8, w_int8, b_int32,
                               format=format).astype("float32")
        if format == "NCHW4":
            # Undo the NCHW4 shuffle before comparing.
            result = F.transpose(result, (0, 1, 4, 2, 3))
        expected = F.flatten(expected)
        result = F.flatten(result)
        # Tolerance of one output quantization step.
        np.testing.assert_allclose(result.numpy(),
                                   expected.numpy(),
                                   atol=outp_scale)
예제 #19
0
파일: xornet.py 프로젝트: mozre/MegEngine
def main():
    """Train XORNet with GradManager, evaluate on a fixed grid of points,
    and dump the traced inference model (with and without a testcase)."""

    # Fall back to CPU when no CUDA device is present.
    if not mge.is_cuda_available():
        mge.set_default_device("cpux")

    net = XORNet()
    gm = ad.GradManager().attach(net.parameters())
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    def train_fun(data, label):
        # One optimization step: forward, backward, parameter update.
        opt.clear_grad()
        with gm:
            pred = net(data)
            loss = F.loss.cross_entropy(pred, label)
            gm.backward(loss)
        opt.step()
        return pred, loss

    def val_fun(data, label):
        # Forward-only evaluation; no gradients recorded.
        pred = net(data)
        loss = F.loss.cross_entropy(pred, label)
        return pred, loss

    @trace(symbolic=True, capture_as_const=True)
    def pred_fun(data):
        # Traced inference entry point; weights captured as constants so
        # the graph can be dumped for deployment.
        pred = net(data)
        pred_normalized = F.softmax(pred)
        return pred_normalized

    data = np.random.random((batch_size, 2)).astype(np.float32)
    label = np.zeros((batch_size,)).astype(np.int32)
    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 1000:
            break
        data = mge.tensor(minibatch["data"])
        label = mge.tensor(minibatch["label"])
        net.train()
        _, loss = train_fun(data, label)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            # NOTE(review): a validation minibatch is drawn but data/label
            # still hold the training batch, so val_fun evaluates on
            # TRAINING data — looks like a bug; confirm intent.
            minibatch = next(val_dataset)
            net.eval()
            _, loss = val_fun(data, label)
            loss = loss.numpy()
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        opt.step()

    # Fixed evaluation points covering all four XOR quadrants.
    test_data = np.array(
        [
            (0.5, 0.5),
            (0.3, 0.7),
            (0.1, 0.9),
            (-0.5, -0.5),
            (-0.3, -0.7),
            (-0.9, -0.1),
            (0.5, -0.5),
            (0.3, -0.7),
            (0.9, -0.1),
            (-0.5, 0.5),
            (-0.3, 0.7),
            (-0.1, 0.9),
        ]
    )

    # tracing only accepts tensor as input
    data = mge.tensor(test_data, dtype=np.float32)
    net.eval()
    out = pred_fun(data)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)

    print("Test data")
    print(test_data)

    with np.printoptions(precision=4, suppress=True):
        print("Predicated probability:")
        print(pred_output)

    print("Predicated label")
    print(pred_label)

    model_name = "xornet_deploy.mge"

    print("Dump model as {}".format(model_name))
    pred_fun.dump(model_name, arg_names=["data"])

    model_with_testcase_name = "xornet_with_testcase.mge"

    # Embed a random input testcase so the dumped model is self-checking.
    print("Dump model with testcase as {}".format(model_with_testcase_name))
    pred_fun.dump(model_with_testcase_name, arg_names=["data"], input_data=["#rand(0.1, 0.8, 4, 2)"])