def _test_func(rank, world_size, tempfile_name, unused):
    """Run a train / eval / train sequence on an FSDP-wrapped checkpointing module.

    The process group is initialized through ``dist_init``; all four
    arguments are forwarded to it unchanged. The parameter shapes observed
    right after wrapping are recorded and handed to every step helper so the
    helpers can compare against them.
    """
    initialized = dist_init(rank, world_size, tempfile_name, unused)
    assert initialized, "Dist init failed"

    # Seed identically on every rank before building the model so that
    # initialization is deterministic.
    torch.manual_seed(0)
    wrapped = FullyShardedDataParallel(SimpleModuleWithCheckpointing().cuda())
    sgd = torch.optim.SGD(wrapped.parameters(), lr=0.001, momentum=0.9)

    # Snapshot the parameter shapes; the steps below receive this mapping.
    expected_param_shapes = {}
    for param_name, param in wrapped.named_parameters():
        expected_param_shapes[param_name] = tuple(param.shape)

    # Spelled out for clarity: the layout the wrapped model is expected to
    # expose. NOTE(review): these sizes presumably depend on the world size
    # the test is launched with -- confirm against the caller.
    reference_shapes = {
        "_fsdp_wrapped_module.flat_param_0": (12,),
        "_fsdp_wrapped_module._fpw_module.ffn.1._fsdp_wrapped_module.flat_param_0": (6,),
    }
    assert expected_param_shapes == reference_shapes, expected_param_shapes

    # From here on every rank uses its own seed.
    torch.manual_seed(1 + rank)

    # One training step, then an eval step, then a final training step.
    for run_step in (_train_step, _eval_step, _train_step):
        run_step(wrapped, sgd, expected_param_shapes)

    teardown()
def _test_func(rank, world_size, tempfile_name, unused, flatten, mixed_precision, amp_context, half_input, fsdp_wrap_ckpt):
    """Run a train / eval / train sequence on an FSDP-wrapped checkpointing module.

    Args:
        rank, world_size, tempfile_name, unused: forwarded unchanged to
            ``dist_init`` to set up the process group.
        flatten: passed to the inner module and to FSDP as
            ``flatten_parameters``; also selects which parameter layout is
            expected after wrapping.
        mixed_precision: passed to the inner module, to FSDP and to every
            step helper.
        amp_context, half_input: forwarded to every step helper.
        fsdp_wrap_ckpt: passed to ``SimpleModuleWithCheckpointing``.

    Raises:
        AssertionError: if distributed init fails or the wrapped model does
            not expose the expected parameter shapes. The observed shapes are
            included in the message for both values of ``flatten``.
    """
    result = dist_init(rank, world_size, tempfile_name, unused)
    assert result, "Dist init failed"

    # Keep initialization deterministic: same seed on every rank.
    torch.manual_seed(0)
    model = FSDP(
        SimpleModuleWithCheckpointing(flatten, mixed_precision, fsdp_wrap_ckpt).cuda(),
        flatten_parameters=flatten,
        mixed_precision=mixed_precision,
    )
    optim = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Collect parameter shapes; the steps below receive this mapping.
    expected_param_shapes = {
        name: tuple(param.shape) for name, param in model.named_parameters()
    }

    # For clarity, this is the layout the wrapped model is expected to expose.
    # NOTE(review): the sizes presumably depend on the world size the test is
    # launched with -- confirm against the caller.
    if flatten:
        reference_shapes = {
            "_fsdp_wrapped_module.flat_param_0": (12,),
            "_fsdp_wrapped_module._fpw_module.ffn.1._fsdp_wrapped_module.flat_param_0": (6,),
        }
    else:
        reference_shapes = {
            "ffn.0.weight": (5,),
            "ffn.0.bias": (2,),
            "ffn.1.weight": (5,),
            "ffn.1.bias": (2,),
            "ffn.2.weight": (5,),
            "ffn.2.bias": (2,),
        }
    # One assertion for both layouts so a failure always reports the shapes
    # that were actually observed (previously only the flattened case did).
    assert expected_param_shapes == reference_shapes, expected_param_shapes

    # From here on every rank uses its own seed.
    torch.manual_seed(1 + rank)

    step_args = (model, optim, expected_param_shapes, amp_context, mixed_precision, half_input)

    # Train for a step.
    _train_step(*step_args)

    # Now do an eval step.
    _eval_step(*step_args)

    # And finally do another train step.
    _train_step(*step_args)

    teardown()
def _test_named_params(self, rank, group, config):
    """Compare parameter names (and shapes) before and after FSDP wrapping.

    ``config`` is modified in place: its ``"ssd_offload"`` entry is removed
    and, when that entry was truthy, an ``"offload_config"`` entry pointing
    at a temporary directory is added. The remaining entries are passed to
    ``FullyShardedDataParallel`` as keyword arguments.
    """
    # Request the named parameters of the bare model before it is wrapped.
    # NOTE(review): if named_parameters() returns a one-shot iterator (as
    # torch.nn.Module's does), nothing is read until the loops below run, and
    # whatever the first zip() consumes is not seen again by the second one.
    # Confirm this is intended.
    before_wrap_model = TransformerWithSharedParams(group)
    before_wrap_params = before_wrap_model.named_parameters()

    with tempfile.TemporaryDirectory() as scratch_dir:
        # "ssd_offload" is a test-level switch and is dropped from the dict
        # before the rest of `config` is expanded into FSDP keyword arguments.
        use_ssd_offload = config.pop("ssd_offload")
        if use_ssd_offload:
            config["offload_config"] = OffloadConfig(
                offload_type="ssd_offload", ssd_filepath_dir=scratch_dir
            )

        model = FullyShardedDataParallel(before_wrap_model, **config)
        print(f"model.ssd_offload {model.ssd_offload}")
        # Only move the model to the GPU when neither SSD offload nor CPU
        # parameter placement is in effect.
        if not (model.ssd_offload or model.move_params_to_cpu):
            model = model.cuda()

        self._eval_with_config(model, autocast=config["mixed_precision"])

        # Named parameters after wrapping, to compare against the originals.
        after_wrap_params = model.named_parameters()

        if config.get("flatten_parameters", False):
            # Flattened: the first reported name must be a flat parameter.
            first_name = list(after_wrap_params)[0][0]
            assert "flat_param_0" in first_name
        else:
            # Not flattened: names must line up one-to-one.
            for (before_name, _), (after_name, _) in zip(before_wrap_params, after_wrap_params):
                assert before_name == after_name

        # Second pass over a fresh iterator: compare names and shapes.
        after_wrap_params = model.named_parameters()
        for (before_name, before_param), (after_name, after_param) in zip(
            before_wrap_params, after_wrap_params
        ):
            assert before_name == after_name
            torch.testing.assert_allclose(before_param.shape, after_param.shape)
def _distributed_worker(
    gpu_id: int, with_fsdp: bool, sync_file: str, result_file: str
):
    """Train a small model for five iterations and record the losses of rank 0.

    Args:
        gpu_id: CUDA device index; also used as the rank in the process group.
        with_fsdp: wrap the model with ``FSDP`` when true, otherwise with
            ``DistributedDataParallel``. Also forwarded to
            ``TestRegnetFSDP._create_config``.
        sync_file: path used for the ``file://`` rendezvous of the process
            group.
        result_file: path where the worker with ``gpu_id == 0`` pickles its
            list of per-iteration loss values.
    """
    torch.cuda.set_device(gpu_id)
    dist.init_process_group(
        backend="nccl",
        init_method=f"file://{sync_file}",
        world_size=2,
        rank=gpu_id,
    )
    records_losses = gpu_id == 0

    # Build the input batch from a fixed seed.
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    batch = torch.randn(size=(8, 3, 224, 224)).cuda()

    # Build a fake model based on SWAV blocks and wrap it for distributed use.
    config = TestRegnetFSDP._create_config(with_fsdp)
    base_model = build_model(config["MODEL"], config["OPTIMIZER"]).cuda()
    model = (
        FSDP(base_model)
        if with_fsdp
        else DistributedDataParallel(base_model, device_ids=[gpu_id])
    )
    criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
    optimizer = optim.SGD(model.parameters(), lr=1e-2)

    # Run a few iterations and collect the losses.
    losses = []
    for iteration in range(5):
        outputs = model(batch)
        loss = criterion(outputs[0], torch.tensor(0.0).cuda())
        if records_losses:
            losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()

        # During the first three iterations, discard the gradients of every
        # parameter whose name contains "prototypes" -- presumably so the
        # optimizer leaves those parameters untouched early on.
        if iteration < 3:
            for param_name, param in model.named_parameters():
                if "prototypes" in param_name:
                    param.grad = None
        optimizer.step()

    # Store the losses in a file so several methods can be compared.
    if records_losses:
        with open(result_file, "wb") as out_file:
            pickle.dump(losses, out_file)