Example #1
 def worker_fn(gpu_id: int, world_size: int, batch_size: int,
               sync_file: str):
     init_distributed_on_file(world_size=world_size,
                              gpu_id=gpu_id,
                              sync_file=sync_file)
     embeddings = torch.full(size=(batch_size, 3),
                             fill_value=float(gpu_id),
                             requires_grad=True).cuda(gpu_id)
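     # All-gather the embeddings from every rank; the gathered tensor must keep
     # requires_grad so that gradients can flow back to the local embeddings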
     gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
     if world_size == 1:
         assert gathered.equal(
             torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
                          device=f"cuda:{gpu_id}"))
     if world_size == 2:
         assert gathered.equal(
             torch.tensor(
                 [
                     [0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0],
                     [1.0, 1.0, 1.0],
                 ],
                 device=f"cuda:{gpu_id}",
             ))
     assert gathered.requires_grad
Example #2
 def worker_fn(gpu_id: int, world_size: int, batch_size: int,
               sync_file: str):
     init_distributed_on_file(world_size=world_size,
                              gpu_id=gpu_id,
                              sync_file=sync_file)
     EMBEDDING_DIM = 128
     criterion = BarlowTwinsCriterion(lambda_=0.0051,
                                      scale_loss=0.024,
                                      embedding_dim=EMBEDDING_DIM)
     embeddings = torch.randn((batch_size, EMBEDDING_DIM),
                              dtype=torch.float32,
                              requires_grad=True).cuda()
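     # Forward and backward through the Barlow Twins criterion: the test only
     # checks that the loss is computable on each rank and differentiable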
     criterion(embeddings).backward()
Example #3
    def _pretraining_worker(
        gpu_id: int,
        with_fsdp: bool,
        with_activation_checkpointing: bool,
        with_larc: bool,
        sync_file: str,
        result_file: str,
    ):
        init_distributed_on_file(world_size=2,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True

        # Create the inputs
        batch = torch.randn(size=(8, 3, 224, 224)).cuda()
        target = torch.tensor(0.0).cuda()

        # Create a fake model based on SWAV blocks
        config = TestRegnetFSDP._create_pretraining_config(
            with_fsdp, with_activation_checkpointing, with_larc=with_larc)
        model = build_model(config["MODEL"], config["OPTIMIZER"])
        model = model.cuda()
        if with_fsdp:
            model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
        else:
            model = DistributedDataParallel(model, device_ids=[gpu_id])
        criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
        optimizer = build_optimizer(config["OPTIMIZER"])
        optimizer.set_param_groups(model.parameters())

        # Run a few iterations and collect the losses
        losses = []
        num_iterations = 5
        for iteration in range(num_iterations):
            out = model(batch)
            loss = criterion(out[0], target)
            if gpu_id == 0:
                losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
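            # Mimic SwAV's prototype freezing: drop the prototype gradients
            # during the first iterations before the optimizer step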
            if iteration <= 2:
                for name, param in model.named_parameters():
                    if "prototypes" in name:
                        param.grad = None
            optimizer.step(where=float(iteration / num_iterations))

        # Store the losses in a file to compare several methods
        if gpu_id == 0:
            with open(result_file, "wb") as f:
                pickle.dump(losses, f)
Example #4
    def _test_synch_bn_pytorch_worker(gpu_id: int, world_size: int,
                                      group_size: int, sync_file: str):
        torch.cuda.set_device(gpu_id)
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)

        config = AttrDict({
            "MODEL": {
                "SYNC_BN_CONFIG": {
                    "SYNC_BN_TYPE": "pytorch",
                    "GROUP_SIZE": group_size,
                }
            },
            "DISTRIBUTED": {
                "NUM_PROC_PER_NODE": world_size,
                "NUM_NODES": 1,
                "NCCL_DEBUG": False,
                "NCCL_SOCKET_NTHREADS": 4,
            },
        })
        set_env_vars(local_rank=gpu_id, node_id=0, cfg=config)

        channels = 8
        model = nn.Sequential(
            nn.BatchNorm2d(num_features=channels),
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        )
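        # Convert the BatchNorm layers to synchronized BatchNorm groups
        # according to MODEL.SYNC_BN_CONFIG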
        model = convert_sync_bn(config, model).cuda(gpu_id)
        model = DistributedDataParallel(model, device_ids=[gpu_id])
        x = torch.full(size=(5, channels, 4, 4), fill_value=float(gpu_id))
        model(x)
        running_mean = model.module[0].running_mean.cpu()
        print(gpu_id, running_mean)
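        # BatchNorm uses momentum 0.1 and each GPU sees a constant input equal to
        # its id: without syncing, rank 0 keeps a running mean of 0.0 and rank 1 of
        # 0.1, while a group spanning both GPUs averages the batch mean to 0.5 -> 0.05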
        if group_size == 1:
            if gpu_id == 0:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.0))
            elif gpu_id == 1:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.1))
        else:
            if gpu_id in {0, 1}:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.05))
Example #5
    def _worker(gpu_id: int, sync_file: str, world_size: int):
        torch.manual_seed(0)
        os.environ["RANK"] = str(gpu_id)
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.backends.cudnn.deterministic = True

        config = TestCheckpointConversion._create_fsdp_model_config(
            with_fsdp=True)
        model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
        optimizer = optim.SGD(model.parameters(), lr=1e-4)

        # Fake inputs
        num_iterations = 5
        batch_size = 3
        torch.manual_seed(gpu_id)
        fake_inputs = torch.randn(size=(num_iterations, batch_size, 3, 96, 96))
        fake_targets = torch.randn(size=(num_iterations, batch_size))

        # Fake training loop
        criterion = nn.MSELoss()
        for iteration in range(num_iterations):
            fake_input = fake_inputs[iteration].cuda(gpu_id)
            fake_target = fake_targets[iteration].cuda(gpu_id)
            output1, output2 = model(fake_input)[0]
            loss = criterion(output1.sum(axis=-1), fake_target) + criterion(
                output2.sum(axis=-1), fake_target)
            if gpu_id == 0:
                print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Save a sharded checkpoint: one shard per rank
        checkpoint_writer = CheckpointWriter(
            checkpoint_folder=".",
            is_final_train_phase=True,
            mode="iteration",
            mode_num=0,
            backend="disk",
        )
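        # Each rank saves its local (sharded) FSDP state dict along with the
        # metadata required to consolidate the shards later on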
        content = {
            "classy_state_dict": {
                "base_model": {
                    "model": {
                        "trunk": model.trunk.local_state_dict()
                    },
                    "meta": {
                        "trunk": model.trunk.local_metadata_dict()
                    },
                }
            }
        }
        checkpoint_writer.save_sharded_checkpoint(content,
                                                  shard_rank=gpu_id,
                                                  world_size=world_size)
        dist.barrier()
        print(os.listdir("."))

        # Convert the checkpoint to consolidated and sliced checkpoints
        if gpu_id == 0:
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
        dist.barrier()
        print(os.listdir("."))

        # Now create models initialized from the previous checkpoint and compare them
        fake_test_input = torch.randn(size=(1, 3, 96, 96)).cuda(gpu_id)

        shard_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint.torch", device=torch.device("cpu"))
        shard_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        shard_model = fsdp_wrapper(shard_model, **config.MODEL.FSDP_CONFIG)
        shard_model.init_model_from_weights_params_file(config, shard_cp)

        conso_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_conso.torch", device=torch.device("cpu"))
        conso_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        conso_model = fsdp_wrapper(conso_model, **config.MODEL.FSDP_CONFIG)
        conso_model.init_model_from_weights_params_file(config, conso_cp)

        slice_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_sliced.torch", device=torch.device("cpu"))
        slice_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        slice_model = fsdp_wrapper(slice_model, **config.MODEL.FSDP_CONFIG)
        slice_model.init_model_from_weights_params_file(config, slice_cp)

        # Verify that the sliced and consolidated checkpoints load equivalent weights
        if gpu_id == 0:
            slice_state_dict = slice_model.local_state_dict()
            conso_state_dict = conso_model.local_state_dict()
            assert set(slice_state_dict.keys()) == set(conso_state_dict.keys())
            for k in slice_state_dict.keys():
                slice_val = slice_state_dict[k]
                conso_val = conso_state_dict[k]
                assert torch.allclose(
                    slice_val, conso_val
                ), f"Difference for key {k}: {slice_val} VS {conso_val}"
        dist.barrier()

        with torch.no_grad():
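            # The original model and the three reloaded models should produce
            # identical outputs on the same test input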
            ref_out = model.trunk(fake_test_input)[0]
            shard_out = shard_model.trunk(fake_test_input)[0]
            conso_out = conso_model.trunk(fake_test_input)[0]
            slice_out = slice_model.trunk(fake_test_input)[0]
            assert torch.allclose(
                ref_out, shard_out), f"{ref_out.sum()} vs {shard_out.sum()}"
            assert torch.allclose(
                ref_out, conso_out), f"{ref_out.sum()} vs {conso_out.sum()}"
            assert torch.allclose(
                ref_out, slice_out), f"{ref_out.sum()} vs {slice_out.sum()}"
Example #6
    def _norm_computation_worker(gpu_id: int, sync_file: str, world_size: int):
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True

        num_iterations = 10
        batch_size = 128
        torch.manual_seed(gpu_id)
        fake_inputs = torch.randn(size=(num_iterations, batch_size, 129))
        fake_targets = torch.randn(size=(num_iterations, batch_size))

        losses = {}
        for with_fsdp in [False, True]:
            torch.manual_seed(0)
            torch.cuda.manual_seed(0)
            losses[with_fsdp] = []

            # Create a simple model
            model = nn.Sequential(nn.Linear(129, 128), nn.ReLU(),
                                  nn.Linear(128, 10))
            model = model.cuda(gpu_id)

            # Setting up FSDP vs DDP with LARC
            larc_config = {
                "clip": False,
                "trust_coefficient": 0.01,
                "eps": 0.00000001
            }
            optimizer = optim.SGD(model.parameters(),
                                  lr=1e-2,
                                  weight_decay=1e-4,
                                  momentum=0.9)
            if with_fsdp:
                model = FullyShardedDataParallel(model,
                                                 flatten_parameters=False)
                optimizer = LARC_FSDP(optimizer,
                                      distributed_norm=True,
                                      **larc_config)
            else:
                model = DistributedDataParallel(model, device_ids=[gpu_id])
                optimizer = LARC_FSDP(optimizer,
                                      distributed_norm=False,
                                      **larc_config)

            # Training loop
            criterion = nn.MSELoss()
            for iteration in range(num_iterations):
                fake_input = fake_inputs[iteration].cuda(gpu_id)
                fake_target = fake_targets[iteration].cuda(gpu_id)
                output = model(fake_input)
                loss = criterion(output.sum(axis=-1), fake_target)
                if gpu_id == 0:
                    losses[with_fsdp].append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if gpu_id == 0:
            for with_fsdp in [False, True]:
                print(losses[with_fsdp])
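                # The distributed norm computation can differ from the local one
                # by floating point noise, so compare losses up to 5 decimals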
                if world_size > 1:
                    losses[with_fsdp] = [
                        round(loss, 5) for loss in losses[with_fsdp]
                    ]
            assert losses[False] == losses[True]
    def _layer_memory_tracking_worker(gpu_id: int, sync_file: str,
                                      world_size: int):
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True
        torch.manual_seed(gpu_id)

        batch_size = 16
        fake_inputs = torch.randn(size=(batch_size, 10)).cuda(gpu_id)
        fake_targets = torch.randn(size=(batch_size, 10)).cuda(gpu_id)
        fake_criterion = nn.MSELoss()

        torch.manual_seed(0)
        torch.cuda.manual_seed(0)

        # Create a global group and a tracker around it
        group = dist.new_group()
        group = ProcessGroupTracker(group)

        # Create a simple model
        model = nn.Sequential(
            nn.Linear(10, 10).cuda(gpu_id),
            nn.ReLU(),
            FullyShardedDataParallel(
                nn.Linear(10, 10).cuda(gpu_id),
                flatten_parameters=False,
                process_group=group,
            ),
            nn.ReLU(),
            FullyShardedDataParallel(
                nn.Linear(10, 10).cuda(gpu_id),
                flatten_parameters=True,
                process_group=group,
            ),
        )
        model = model.cuda(gpu_id)
        model = FullyShardedDataParallel(model,
                                         flatten_parameters=False,
                                         process_group=group)

        # Set up the tracking of the model
        tracker = LayerwiseMemoryTracker()
        tracker.monitor(model)

        # Fake forward / backward pass
        fake_criterion(model(fake_inputs), fake_targets).backward()

        # Collect the all-gather traces (the FSDP-specific events)
        tracker.stop()
        all_gathered_traces = [
            (t.module_name, t.all_gathered, t.cumul_all_gathered)
            for t in tracker.memory_traces if t.all_gathered > 0
        ]
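        # 440 bytes corresponds to the 110 fp32 parameters of one nn.Linear(10, 10)
        # (100 weights + 10 biases) being all-gathered by FSDP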
        assert all_gathered_traces == [
            ("_fsdp_wrapped_module.0", 440, 440),
            ("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 880),
            ("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440,
             880),
            ("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440,
             0),
            ("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 0),
        ]