def test_sync_min_max_observer():
    world_size = get_device_count_by_fork("gpu")
    x = np.random.rand(3 * world_size, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()

    @dist.launcher
    def worker():
        rank = dist.get_rank()
        m = SyncMinMaxObserver()
        y = mge.tensor(x[rank * 3 : (rank + 1) * 3])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    worker()
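# Illustrative sketch (not exercised by the test above): the kind of cross-rank
# reduction a synced observer is expected to perform so that every worker ends up
# with the global statistics asserted in test_sync_min_max_observer.  This assumes
# all_reduce_min / all_reduce_max are importable from
# megengine.distributed.functional; adjust the import if the path differs.
def _synced_min_max_sketch(local_chunk):
    import megengine.functional as F
    from megengine.distributed.functional import all_reduce_max, all_reduce_min

    t = mge.tensor(local_chunk)
    # reduce locally first, then across all ranks in the process group
    return all_reduce_min(F.min(t)), all_reduce_max(F.max(t))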
def test_sync_exponential_moving_average_observer():
    world_size = get_device_count_by_fork("gpu")
    t = np.random.rand()
    x1 = np.random.rand(3 * world_size, 3, 3, 3).astype("float32")
    x2 = np.random.rand(3 * world_size, 3, 3, 3).astype("float32")
    # with momentum t, two observations give: stat = t * stat(x1) + (1 - t) * stat(x2)
    expected_min = x1.min() * t + x2.min() * (1 - t)
    expected_max = x1.max() * t + x2.max() * (1 - t)

    @dist.launcher
    def worker():
        rank = dist.get_rank()
        m = SyncExponentialMovingAverageObserver(momentum=t)
        y1 = mge.tensor(x1[rank * 3 : (rank + 1) * 3])
        y2 = mge.tensor(x2[rank * 3 : (rank + 1) * 3])
        m(y1)
        m(y2)
        np.testing.assert_allclose(m.min_val.numpy(), expected_min, atol=1e-6)
        np.testing.assert_allclose(m.max_val.numpy(), expected_max, atol=1e-6)

    worker()
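# Standalone sketch of the moving-average recurrence that the expected values above
# encode (an illustration of the arithmetic only, not part of the observer API):
# starting from the first observation, each later one is blended in with weight
# (1 - momentum).
def _ema_sketch(values, momentum):
    stat = values[0]
    for v in values[1:]:
        stat = stat * momentum + v * (1 - momentum)
    return stat


# e.g. inside the test above, _ema_sketch([x1.min(), x2.min()], t) == expected_min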
def as_tensor(x):
    return mge.Tensor(x)


def save_to(self, name="grad"):
    def callback(grad):
        setattr(self, name, grad)

    return callback


@pytest.mark.skipif(
    platform.system() == "Darwin", reason="GPU mode is not implemented on macOS yet"
)
@pytest.mark.skipif(
    platform.system() == "Windows", reason="Windows build disables MGB_ENABLE_OPR_MM"
)
@pytest.mark.skipif(
    get_device_count_by_fork("gpu") < 2, reason="need at least 2 gpu devices"
)
@pytest.mark.isolated_distributed
def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")

    @dist.launcher
    def worker():
        rank = dist.get_rank()
        if rank == 0:
            grad = Grad()
            x = as_tensor(x_np)
            grad.wrt(x, callback=save_to(x))
            # need a placeholder to trace operator
import os
import platform
import sys

import pytest

import megengine.functional
import megengine.module
from megengine import Parameter
from megengine.core._imperative_rt.core2 import sync
from megengine.distributed.helper import get_device_count_by_fork
from megengine.jit import trace as _trace
from megengine.module import Linear, Module

sys.path.append(os.path.join(os.path.dirname(__file__), "helpers"))

_ngpu = get_device_count_by_fork("gpu")


@pytest.fixture(autouse=True)
def skip_by_ngpu(request):
    if request.node.get_closest_marker("require_ngpu"):
        require_ngpu = int(request.node.get_closest_marker("require_ngpu").args[0])
        if require_ngpu > _ngpu:
            pytest.skip("skipped for ngpu unsatisfied: {}".format(require_ngpu))


@pytest.fixture(autouse=True)
def skip_distributed(request):
    if request.node.get_closest_marker("distributed_isolated"):
            no_trans,
            part_size,
            pooled_h,
            pooled_w,
            sample_per_part,
            spatial_scale,
            trans_std,
        )
        return y

    result = fwd(inp, rois, trans)
    check_pygraph_dump(fwd, [inp, rois, trans], [result])


@pytest.mark.skipif(
    get_device_count_by_fork("gpu") > 0,
    reason="does not support int8 when gpu compute capability less than 6.1",
)
def test_convbias():
    @trace(symbolic=True, capture_as_const=True)
    def fwd(inp, weight, bias):
        return F.quantized.conv_bias_activation(
            inp, weight, bias, dtype=dtype.qint8(scale=1.0), nonlinear_mode="relu"
        )

    inp = Tensor(np.random.random((1, 3, 64, 64)), dtype=dtype.qint8(scale=1.0))
    weight = Tensor(np.random.random((32, 3, 3, 3)), dtype=dtype.qint8(scale=1.0))
def test_min_max_observer():
    x = np.random.rand(3, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    x = mge.tensor(x)
    m = ob.MinMaxObserver()
    m(x)
    assert m.min_val == np_min and m.max_val == np_max


@pytest.mark.skipif(
    platform.system() == "Darwin", reason="GPU mode is not implemented on macOS yet"
)
@pytest.mark.skipif(
    platform.system() == "Windows", reason="Windows build disables MGB_ENABLE_OPR_MM"
)
@pytest.mark.skipif(
    get_device_count_by_fork("gpu") < 2, reason="need at least 2 gpu devices"
)
@pytest.mark.isolated_distributed
def test_sync_min_max_observer():
    x = np.random.rand(6, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, slc):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        m = ob.SyncMinMaxObserver()
        y = mge.tensor(x[slc])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max
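    # Minimal sketch of how the two workers could be driven (an assumption, not the
    # original continuation of this test): spawn one process per rank with standard
    # multiprocessing, handing each rank its own slice of x.
    import multiprocessing as mp

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, slice(rank * 3, (rank + 1) * 3)))
        p.start()
        procs.append(p)
    for p in procs:
        p.join(timeout=20)
        assert p.exitcode == 0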
from megengine import is_cuda_available, tensor
from megengine.core._imperative_rt import CompNode
from megengine.core._imperative_rt.core2 import apply
from megengine.core._imperative_rt.ops import (
    delete_rng_handle,
    get_global_rng_seed,
    new_rng_handle,
)
from megengine.core.ops.builtin import GaussianRNG, UniformRNG
from megengine.distributed.helper import get_device_count_by_fork
from megengine.random import RNG
from megengine.random.rng import _normal, _uniform


@pytest.mark.skipif(
    get_device_count_by_fork("xpu") <= 2, reason="requires more than 2 xpu devices"
)
def test_gaussian_op():
    shape = (8, 9, 11, 12)
    shape = tensor(shape, dtype="int32")
    op = GaussianRNG(seed=get_global_rng_seed(), mean=1.0, std=3.0)
    (output,) = apply(op, shape)
    assert np.fabs(output.numpy().mean() - 1.0) < 1e-1
    # compare |std - 3.0|; the signed difference alone would pass trivially for small std
    assert np.fabs(np.sqrt(output.numpy().var()) - 3.0) < 1e-1
    assert str(output.device) == str(CompNode("xpux"))
    net.train()
    qat_net = quantize_qat(net, inplace=False)
    disable_fake_quant(qat_net)
    normal_outputs = net(inputs)
    qat_outputs = qat_net(inputs)
    np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())

    net.eval()
    normal_outputs = net(inputs)
    qat_net.eval()
    qat_outputs = qat_net(inputs)
    np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())


@pytest.mark.skipif(
    get_device_count_by_fork("gpu") > 0, reason="no int8 algorithm on cuda"
)
def test_qat_batchmatmul_activation():
    batch = 4
    in_features = 8
    out_features = 4

    class TestNet(Module):
        def __init__(self, bias):
            super().__init__()
            self.quant = QuantStub()
            self.dequant = DequantStub()
            self.batch_mm = BatchMatMulActivation(
                batch, in_features, out_features, bias=bias
            )