Example #1
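Note: the snippets on this page are taken from ignite's tests for the distributed `auto_*` helpers (`auto_dataloader`, `auto_model`, `auto_optim`) and share a common set of imports that the listing omits. A minimal sketch of those imports, assuming a recent ignite version; the `_test_auto_dataloader` helper called below is defined elsewhere in the same test module and is not shown here:

import pytest
import torch
import torch.nn as nn
import torch.optim as optim
from distutils.version import LooseVersion

import ignite.distributed as idist
from ignite.distributed.auto import auto_dataloader, auto_model, auto_optim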
def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws,
                          nproc=ws,
                          batch_size=10,
                          sampler_name="WeightedRandomSampler")
    _test_auto_dataloader(ws=ws,
                          nproc=ws,
                          batch_size=10,
                          sampler_name="DistributedSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1 and device.type == "cpu":
        # PyTorch <= 1.9.0 raises AssertionError, newer versions raise ValueError
        error_type = AssertionError if LooseVersion(torch.__version__) <= LooseVersion("1.9.0") else ValueError
        with pytest.raises(
                error_type,
                match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
Example #2
def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws,
                          nproc=ws,
                          batch_size=10,
                          sampler_name="WeightedRandomSampler")
    _test_auto_dataloader(ws=ws,
                          nproc=ws,
                          batch_size=10,
                          sampler_name="DistributedSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1 and device.type == "cpu":
        # Pytorch <= 1.9.0 => AssertionError
        # Pytorch >  1.9   => ValueError
        # https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py#L1498
        with pytest.raises(
            (AssertionError, ValueError),
                match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
Example #3
def test_auto_methods_nccl(distributed_context_single_node_nccl):

    ws = distributed_context_single_node_nccl["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1, sampler_name="WeightedRandomSampler")

    device = idist.device()
    _test_auto_model_optimizer(ws, device)

    if ws > 1:
        with pytest.raises(ValueError, match=r"Argument kwargs should not contain 'device_ids'"):
            auto_model(nn.Linear(1, 1), device_ids=[0])
Example #4
def _test_auto_model_optimizer(ws, device):
    # Test auto_model
    model = nn.Linear(10, 10)
    model = auto_model(model)
    bnd = idist.backend()
    if ws > 1 and device in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
        elif idist.has_hvd_support and bnd in ("horovod", ):
            assert isinstance(model, nn.Module)
    elif device != "cpu" and torch.cuda.is_available(
    ) and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all([p.device.type == device
                for p in model.parameters()]), "{} vs {}".format(
                    [p.device.type for p in model.parameters()], device)

    # Test auto_optim
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    optimizer = auto_optim(optimizer)
    if idist.has_xla_support and "xla" in device:
        assert isinstance(optimizer, optim.SGD) and hasattr(
            optimizer, "wrapped_optimizer")
    elif idist.has_hvd_support and bnd in ("horovod", ):
        assert isinstance(optimizer, optim.SGD) and hasattr(
            optimizer, "_allreduce_grad_async")
    else:
        assert isinstance(optimizer, optim.SGD) and not hasattr(
            optimizer, "wrapped_optimizer")
Example #5
def _test_auto_model(model, ws, device, sync_bn=False, **kwargs):
    model = auto_model(model, sync_bn=sync_bn, **kwargs)
    bnd = idist.backend()
    if ws > 1 and torch.device(device).type in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
            if sync_bn:
                assert any(
                    [isinstance(m, nn.SyncBatchNorm) for m in model.modules()])
            if "find_unused_parameters" in kwargs:
                assert model.find_unused_parameters == kwargs[
                    "find_unused_parameters"]
        elif idist.has_hvd_support and bnd in ("horovod", ):
            assert isinstance(model, nn.Module)
    elif device != "cpu" and torch.cuda.is_available(
    ) and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all(
        [
            p.device.type == torch.device(device).type
            for p in model.parameters()
        ]
    ), f"{[p.device.type for p in model.parameters()]} vs {torch.device(device).type}"
Example #6
def _test_auto_model_optimizer(ws, device):
    # Test auto_model
    model = nn.Linear(10, 10)
    model = auto_model(model)
    if ws > 1:
        assert isinstance(model, nn.parallel.DistributedDataParallel)
    elif device != "cpu" and torch.cuda.is_available(
    ) and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all([p.device.type == device
                for p in model.parameters()]), "{} vs {}".format(
                    [p.device.type for p in model.parameters()], device)

    # Test auto_optim
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    optimizer = auto_optim(optimizer)
    if "xla" in device:
        assert isinstance(optimizer, optim.SGD) and hasattr(
            optimizer, "wrapped_optimizer")
    else:
        assert isinstance(optimizer, optim.SGD) and not hasattr(
            optimizer, "wrapped_optimizer")
Example #7
def test_auto_methods_gloo(distributed_context_single_node_gloo):

    ws = distributed_context_single_node_gloo["world_size"]
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
    _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
    _test_auto_dataloader(ws=ws,
                          nproc=ws,
                          batch_size=10,
                          sampler_name="WeightedRandomSampler")

    _test_auto_model_optimizer(ws, "cpu")

    if ws > 1:
        with pytest.raises(
                AssertionError,
                match=r"SyncBatchNorm layers only work with GPU modules"):
            model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100))
            auto_model(model, sync_bn=True)
Example #8
def _test_auto_model(model, ws, device, sync_bn=False):
    model = auto_model(model, sync_bn=sync_bn)
    bnd = idist.backend()
    if ws > 1 and device in ("cuda", "cpu"):
        if idist.has_native_dist_support and bnd in ("nccl", "gloo"):
            assert isinstance(model, nn.parallel.DistributedDataParallel)
            if sync_bn:
                assert any(
                    [isinstance(m, nn.SyncBatchNorm) for m in model.modules()])
        elif idist.has_hvd_support and bnd in ("horovod", ):
            assert isinstance(model, nn.Module)
    elif device != "cpu" and torch.cuda.is_available(
    ) and torch.cuda.device_count() > 1:
        assert isinstance(model, nn.parallel.DataParallel)
    else:
        assert isinstance(model, nn.Module)

    assert all([p.device.type == device
                for p in model.parameters()]), "{} vs {}".format(
                    [p.device.type for p in model.parameters()], device)
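
For context beyond the tests, the same `auto_*` helpers are normally used together inside a function launched with `idist.Parallel`. A minimal sketch of that pattern, assuming a recent ignite version and the gloo (CPU) backend; the dataset and training loop below are illustrative placeholders, not part of ignite:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset

import ignite.distributed as idist
from ignite.distributed.auto import auto_dataloader, auto_model, auto_optim


def training(local_rank, config):
    device = idist.device()

    # auto_dataloader picks a distributed-aware sampler and adjusts batch_size/num_workers per process
    dataset = TensorDataset(torch.randn(128, 10), torch.randn(128, 10))
    loader = auto_dataloader(dataset, batch_size=16, num_workers=2, shuffle=True)

    # auto_model moves the model to idist.device() and wraps it (DDP, DataParallel, Horovod) when needed
    model = auto_model(nn.Linear(10, 10))
    # auto_optim returns the optimizer unchanged unless the backend requires a wrapper (e.g. XLA, Horovod)
    optimizer = auto_optim(optim.SGD(model.parameters(), lr=0.01))

    loss_fn = nn.MSELoss()
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()


if __name__ == "__main__":
    # Spawn 2 local processes with the gloo backend; the tests above exercise these same code paths
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, {})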