Example #1
def _test_func(rank, world_size, tempfile_name, unused):
    result = dist_init(rank, world_size, tempfile_name, unused)
    assert result, "Dist init failed"

    # Keep initialization deterministic.
    torch.manual_seed(0)

    model = FullyShardedDataParallel(SimpleModuleWithCheckpointing().cuda())
    optim = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Collect parameter sizes to ensure these stay consistent through the steps below.
    expected_param_shapes = {
        name: tuple(param.shape)
        for name, param in model.named_parameters()
    }

    # For clarity, this is what `expected_param_shapes` should look like for the world size used in this test:
    assert expected_param_shapes == {
        "_fsdp_wrapped_module.flat_param_0": (12, ),
        "_fsdp_wrapped_module._fpw_module.ffn.1._fsdp_wrapped_module.flat_param_0":
        (6, ),
    }, expected_param_shapes

    torch.manual_seed(1 + rank)

    # Train for a step.
    _train_step(model, optim, expected_param_shapes)

    # Now do an eval step.
    _eval_step(model, optim, expected_param_shapes)

    # And finally do another train step.
    _train_step(model, optim, expected_param_shapes)

    teardown()
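This worker is meant to run once per rank. A minimal driver sketch, assuming `torch.multiprocessing.spawn` and fresh temporary files for the rendezvous arguments; only `_test_func` comes from the example above, everything else is illustrative:

import tempfile

import torch.multiprocessing as mp


def run_example(world_size: int = 2) -> None:
    # spawn() calls _test_func(rank, world_size, tempfile_name, unused),
    # passing the process index as the first argument.
    tempfile_name = tempfile.mkstemp()[1]
    unused = tempfile.mkstemp()[1]
    mp.spawn(
        _test_func,
        args=(world_size, tempfile_name, unused),
        nprocs=world_size,
        join=True,
    )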
Example #2
def _test_func(rank, world_size, tempfile_name, unused, flatten,
               mixed_precision, amp_context, half_input, fsdp_wrap_ckpt):
    result = dist_init(rank, world_size, tempfile_name, unused)
    assert result, "Dist init failed"

    # Keep initialization deterministic.
    torch.manual_seed(0)

    model = FSDP(
        SimpleModuleWithCheckpointing(flatten, mixed_precision,
                                      fsdp_wrap_ckpt).cuda(),
        flatten_parameters=flatten,
        mixed_precision=mixed_precision,
    )
    optim = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Collect parameter sizes to ensure these stay consistent through the steps below.
    expected_param_shapes = {
        name: tuple(param.shape)
        for name, param in model.named_parameters()
    }

    # For clarity, this is what `expected_param_shapes` should look like depending on
    # whether parameters are flattened (the flattened shard sizes also depend on world size):
    if not flatten:
        assert expected_param_shapes == {
            "ffn.0.weight": (5, ),
            "ffn.0.bias": (2, ),
            "ffn.1.weight": (5, ),
            "ffn.1.bias": (2, ),
            "ffn.2.weight": (5, ),
            "ffn.2.bias": (2, ),
        }
    else:
        assert expected_param_shapes == {
            "_fsdp_wrapped_module.flat_param_0": (12, ),
            "_fsdp_wrapped_module._fpw_module.ffn.1._fsdp_wrapped_module.flat_param_0":
            (6, ),
        }, expected_param_shapes

    torch.manual_seed(1 + rank)

    # Train for a step.
    _train_step(model, optim, expected_param_shapes, amp_context,
                mixed_precision, half_input)

    # Now do an eval step.
    _eval_step(model, optim, expected_param_shapes, amp_context,
               mixed_precision, half_input)

    # And finally do another train step.
    _train_step(model, optim, expected_param_shapes, amp_context,
                mixed_precision, half_input)

    teardown()
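`_train_step` and `_eval_step` are not part of the excerpt. A rough sketch of what such helpers could look like for the signature used above, assuming the wrapped module accepts a small float input; the input size, the loss, and the handling of the precision flags are all assumptions:

import torch


def _check_param_shapes(model, expected_param_shapes):
    # Parameter names and (sharded) shapes must stay stable across steps.
    current = {name: tuple(p.shape) for name, p in model.named_parameters()}
    assert current == expected_param_shapes, current


def _train_step(model, optim, expected_param_shapes, amp_context, mixed_precision, half_input):
    # `mixed_precision` is accepted only for signature parity here; the FSDP
    # wrapper handles parameter casting internally.
    model.train()
    _check_param_shapes(model, expected_param_shapes)
    inputs = torch.rand(2, 3, device="cuda")
    if half_input:
        inputs = inputs.half()
    with amp_context:
        loss = model(inputs).sum()
    loss.backward()
    optim.step()
    optim.zero_grad()
    _check_param_shapes(model, expected_param_shapes)


def _eval_step(model, optim, expected_param_shapes, amp_context, mixed_precision, half_input):
    model.eval()
    _check_param_shapes(model, expected_param_shapes)
    with torch.no_grad(), amp_context:
        inputs = torch.rand(2, 3, device="cuda")
        if half_input:
            inputs = inputs.half()
        model(inputs)
    _check_param_shapes(model, expected_param_shapes)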
Example #3
    def _test_named_params(self, rank, group, config):
        # Get the named parameters before wrapping.
        before_wrap_model = TransformerWithSharedParams(group)
        before_wrap_params = before_wrap_model.named_parameters()

        with tempfile.TemporaryDirectory() as current_tempdir:
            if config["ssd_offload"]:
                config["offload_config"] = OffloadConfig(
                    offload_type="ssd_offload",
                    ssd_filepath_dir=current_tempdir)
            del config["ssd_offload"]

            model = FullyShardedDataParallel(before_wrap_model, **config)
            print(f"model.ssd_offload {model.ssd_offload}")
            if not model.ssd_offload and not model.move_params_to_cpu:
                model = model.cuda()

            self._eval_with_config(model, autocast=config["mixed_precision"])

            # Get the named parameters after wrapping to compare.
            after_wrap_params = model.named_parameters()

            if not config.get("flatten_parameters", False):
                for before_nm, after_nm in zip(before_wrap_params,
                                               after_wrap_params):
                    assert before_nm[0] == after_nm[0]
            else:
                named_params_flat = list(after_wrap_params)[0][0]
                assert "flat_param_0" in named_params_flat

            after_wrap_params = model.named_parameters()

            for before_nm, after_nm_original in zip(before_wrap_params,
                                                    after_wrap_params):
                assert before_nm[0] == after_nm_original[0]
                torch.testing.assert_allclose(before_nm[1].shape,
                                              after_nm_original[1].shape)
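The `config` dict consumed above is not shown in the excerpt. A plausible way to enumerate such configurations, where every key and value is an assumption inferred from the keys the example reads (`ssd_offload`, `mixed_precision`, `flatten_parameters`):

from itertools import product


def _named_params_configs():
    # Cartesian product over the flags the example pulls out of `config`;
    # `ssd_offload` is popped by the test and converted into an OffloadConfig.
    for flatten, mixed_precision, ssd_offload in product([False, True], repeat=3):
        yield {
            "flatten_parameters": flatten,
            "mixed_precision": mixed_precision,
            "ssd_offload": ssd_offload,
        }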
Example #4
    def _distributed_worker(
        gpu_id: int, with_fsdp: bool, sync_file: str, result_file: str
    ):
        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl", init_method="file://" + sync_file, world_size=2, rank=gpu_id
        )

        # Create the inputs
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True
        batch = torch.randn(size=(8, 3, 224, 224)).cuda()

        # Create a fake model based on SWAV blocks
        config = TestRegnetFSDP._create_config(with_fsdp)
        model = build_model(config["MODEL"], config["OPTIMIZER"])
        model = model.cuda()
        if with_fsdp:
            model = FSDP(model)
        else:
            model = DistributedDataParallel(model, device_ids=[gpu_id])
        criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
        optimizer = optim.SGD(model.parameters(), lr=1e-2)

        # Run a few iterations and collect the losses
        losses = []
        for iteration in range(5):
            out = model(batch)
            loss = criterion(out[0], torch.tensor(0.0).cuda())
            if gpu_id == 0:
                losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            if iteration <= 2:
                for name, param in model.named_parameters():
                    if "prototypes" in name:
                        param.grad = None
            optimizer.step()

        # Store the losses in a file to compare several methods
        if gpu_id == 0:
            with open(result_file, "wb") as f:
                pickle.dump(losses, f)
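A driver that launches the worker above for one data-parallel flavour and reads back the recorded losses might look like the sketch below; the helper name, file layout, and world size are assumptions, and `_distributed_worker` is the function from the example (referenced as `TestRegnetFSDP._distributed_worker` if it is kept as a staticmethod):

import os
import pickle
import tempfile

import torch.multiprocessing as mp


def collect_losses(with_fsdp: bool, world_size: int = 2):
    # One process per GPU; rank 0 writes its losses to `result_file`,
    # which is read back once all ranks have joined.
    with tempfile.TemporaryDirectory() as tmp:
        sync_file = os.path.join(tmp, "sync")
        result_file = os.path.join(tmp, "losses.pkl")
        mp.spawn(
            _distributed_worker,
            args=(with_fsdp, sync_file, result_file),
            nprocs=world_size,
            join=True,
        )
        with open(result_file, "rb") as f:
            return pickle.load(f)

Comparing `collect_losses(False)` against `collect_losses(True)` with a small tolerance then checks that the FSDP run tracks the DDP run on this model.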