def run_cluster_assignment(self, with_fsdp: bool):
        with in_temporary_directory() as pretrain_dir:

            # Pre-train a SwAV model in order to get some weights
            pretrain_config = self._create_pretraining_config(with_fsdp=with_fsdp)
            run_integration_test(pretrain_config)

            # Extract the cluster assignments of each sample
            with in_temporary_directory() as extract_dir:
                extract_config = self._create_extract_cluster_config(
                    with_fsdp=with_fsdp,
                    checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                )
                run_integration_test(extract_config, engine_name="extract_cluster")
                self.assertIn("cluster_assignments.torch", os.listdir(extract_dir))
                shutil.move(
                    src=os.path.join(extract_dir, "cluster_assignments.torch"),
                    dst=os.path.join(pretrain_dir, "cluster_assignments.torch"),
                )

            # Load the cluster assignments and check their structure
            assignments = ClusterAssignmentLoader.load_cluster_assigment(
                "cluster_assignments.torch"
            )
            self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
            self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
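
Every example in this listing relies on the in_temporary_directory helper. A minimal sketch of such a context manager, assuming it creates a scratch directory, makes it the current working directory, and yields its path (VISSL's actual implementation may differ):

import contextlib
import os
import tempfile


@contextlib.contextmanager
def in_temporary_directory():
    # Create a scratch directory, chdir into it, and restore the previous
    # working directory (deleting the scratch space) on exit
    old_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        try:
            yield temp_dir
        finally:
            os.chdir(old_cwd)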
Example #2
    def test_fine_tuning_end_to_end_fsdp(self):
        with in_temporary_directory() as pretrain_dir:
            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config(
                with_fsdp=True, fsdp_flatten_parameters=True)
            run_integration_test(pretrain_config)
            sharded_checkpoint_path = os.path.join(pretrain_dir,
                                                   "checkpoint.torch")
            sliced_checkpoint_path = os.path.join(pretrain_dir, "sliced.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                input_checkpoint_path=sharded_checkpoint_path,
                output_checkpoint_path=sliced_checkpoint_path,
            )

            # Create a separate directory in which to run the fine-tuning
            with in_temporary_directory():
                finetune_config = self._create_finetuning_config(
                    sliced_checkpoint_path,
                    construct_single_param_group_only=False,
                    regularize_bias=False,
                    with_fsdp=True,
                    fsdp_flatten_parameters=False,
                )
                result = run_integration_test(finetune_config)
                accuracies = result.get_accuracies(from_metrics_file=True)
                self.assertEqual(4, len(accuracies))
Example #3
    def run_cluster_assignment(self, with_fsdp: bool):
        with in_temporary_directory() as pretrain_dir:

            # Pre-train a SwAV model in order to get some weights
            pretrain_config = self._create_pretraining_config(
                with_fsdp=with_fsdp)
            run_integration_test(pretrain_config)

            # Extract the cluster assignments of each sample
            with in_temporary_directory() as extract_dir:
                extract_config = self._create_extract_cluster_config(
                    with_fsdp=with_fsdp,
                    checkpoint_path=os.path.join(pretrain_dir,
                                                 "checkpoint.torch"),
                )
                extract_config.EXTRACT_FEATURES.CHUNK_THRESHOLD = 10
                run_integration_test(extract_config,
                                     engine_name="extract_cluster")
                extraction_outputs = os.listdir(extract_dir)

                # Check that the cluster assignments are computed in both
                # compact format and dataset disk_filelist format
                self.assertIn("cluster_assignments.torch", extraction_outputs)
                self.assertIn("train_images.npy", extraction_outputs)
                self.assertIn("train_labels.npy", extraction_outputs)
                self.assertIn("test_images.npy", extraction_outputs)
                self.assertIn("test_labels.npy", extraction_outputs)

                # Check that the soft assignments (on prototypes) are exported
                for rank in range(2):
                    for chunk in range(2):
                        file_name = f"rank{rank}_chunk{chunk}_train_heads_protos.npy"
                        self.assertIn(file_name, extraction_outputs)
                        self.assertEqual(np.load(file_name).shape[1], 3000)
                    file_name = f"rank{rank}_chunk0_test_heads_protos.npy"
                    self.assertIn(file_name, extraction_outputs)
                    self.assertEqual(np.load(file_name).shape[1], 3000)

                # Move the cluster assignments next to the pre-training checkpoint
                shutil.move(
                    src=os.path.join(extract_dir, "cluster_assignments.torch"),
                    dst=os.path.join(pretrain_dir,
                                     "cluster_assignments.torch"),
                )

            # Load the cluster assignments and check their structure
            assignments = ClusterAssignmentLoader.load_cluster_assigment(
                "cluster_assignments.torch")
            self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
            self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
Example #4

    def test_extract_cluster_assignment_ddp(self):
        with in_temporary_directory() as pretrain_dir:

            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config()
            run_integration_test(pretrain_config)

            # Create a directory to contain the extracted features
            with in_temporary_directory() as extract_dir:

                # Run the extract engine in a separate directory to check that
                # it is correctly able to output the features in another dir
                with in_temporary_directory():
                    extract_config = self._create_extract_features_config(
                        checkpoint_path=os.path.join(pretrain_dir,
                                                     "checkpoint.torch"))
                    extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                    run_integration_test(extract_config,
                                         engine_name="extract_features")

                # Check the content of the directory containing the extracted features
                folder_content = os.listdir(extract_dir)
                print(folder_content)
                for rank in [0, 1]:
                    for chunk in range(5):
                        for file in [
                                f"rank{rank}_chunk{chunk}_train_heads_features.npy",
                                f"rank{rank}_chunk{chunk}_train_heads_inds.npy",
                                f"rank{rank}_chunk{chunk}_train_heads_targets.npy",
                        ]:
                            self.assertIn(file, folder_content)

                # Verify that we can merge the features back (train split)
                train_feat = merge_features(extract_dir, "train", "heads")
                print(train_feat)
                self.assertEqual(train_feat["features"].shape,
                                 torch.Size([40, 128]))
                self.assertEqual(train_feat["targets"].shape,
                                 torch.Size([40, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

                # Verify that we can merge the features back (test split)
                test_feat = merge_features(extract_dir, "test", "heads")
                self.assertEqual(test_feat["features"].shape,
                                 torch.Size([20, 128]))
                self.assertEqual(test_feat["targets"].shape,
                                 torch.Size([20, 1]))
                self.assertEqual(test_feat["inds"].shape, torch.Size([20]))
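
For reference, here is a minimal sketch of what merging the per-rank, per-chunk shards back together could look like. The file naming is taken from the assertions above, and the sort by inds matches the merged shapes checked on train_feat and test_feat. This is an illustration, not VISSL's actual merge_features:

import glob
import os

import numpy as np
import torch


def merge_feature_shards(input_dir: str, split: str, layer: str):
    # Gather every rank/chunk shard written for the given split and layer
    pattern = os.path.join(input_dir, f"rank*_chunk*_{split}_{layer}_features.npy")
    features, inds, targets = [], [], []
    for feature_file in sorted(glob.glob(pattern)):
        prefix = feature_file[: -len("features.npy")]
        features.append(np.load(feature_file))
        inds.append(np.load(prefix + "inds.npy"))
        targets.append(np.load(prefix + "targets.npy"))

    # Concatenate the shards and restore the original dataset order
    features = np.concatenate(features)
    inds = np.concatenate(inds)
    targets = np.concatenate(targets)
    order = np.argsort(inds)
    return {
        "features": torch.from_numpy(features[order]),
        "targets": torch.from_numpy(targets[order]),
        "inds": torch.from_numpy(inds[order]),
    }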
Example #5

    def test_get_shard_file_names(self):
        with in_temporary_directory() as temp_dir:

            # Generate a bunch of split/feature files
            for split in ["train", "test"]:
                for layer in ["heads", "res5"]:
                    self.prepare_data(split=split,
                                      layer=layer,
                                      num_shards=2,
                                      feat_shape=(10, 16))

            # Check that we only consider the right files
            paths = ExtractedFeaturesLoader.get_shard_file_names(
                input_dir=temp_dir, split="train", layer="heads")
            feature_files = {
                os.path.split(path.feature_file)[1]
                for path in paths
            }
            self.assertEqual(
                feature_files,
                {
                    "chunk0_train_heads_features.npy",
                    "chunk1_train_heads_features.npy"
                },
            )
Example #6
    def run_config(self, config, with_memory: bool = False):
        with in_temporary_directory():
            result = run_integration_test(config)
            losses = result.get_losses()
            if with_memory:
                return losses, result.get_peak_memory()
            return losses
Example #7

    def test_merge_features(self):
        with in_temporary_directory() as temp_dir:

            # Save the data we need to merge back
            indices, features, targets = self.prepare_data(split="train",
                                                           layer="heads",
                                                           num_shards=4,
                                                           feat_shape=(10, 16))

            # Load the data and verify that it is identical
            output = ExtractedFeaturesLoader.load_features(input_dir=temp_dir,
                                                           split="train",
                                                           layer="heads")
            self.assertEqual(output["features"].shape[0], 40)
            self.assertTrue(np.array_equal(output["inds"], indices))
            self.assertTrue(np.array_equal(output["targets"], targets))
            self.assertTrue(np.allclose(output["features"], features))

            # Sample all the data (no sub-sampling) and check that it is identical
            output = ExtractedFeaturesLoader.sample_features(
                input_dir=temp_dir,
                split="train",
                layer="heads",
                num_samples=-1,
                seed=0)
            self.assertEqual(output["features"].shape[0], 40)
            self.assertTrue(np.array_equal(output["inds"], indices))
            self.assertTrue(np.array_equal(output["targets"], targets))
            self.assertTrue(np.allclose(output["features"], features))
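
The prepare_data helper used by these loader tests is not part of the listing. A plausible sketch, assuming it writes one chunk{i}_{split}_{layer}_*.npy triplet per shard into the current directory (the naming follows the files asserted in test_get_shard_file_names) and returns the full arrays for later comparison:

import numpy as np


def prepare_data(split: str, layer: str, num_shards: int, feat_shape):
    # Generate random features, indices and targets for the whole split,
    # then write them out as one .npy triplet per shard
    shard_size, feat_dim = feat_shape
    num_samples = num_shards * shard_size
    indices = np.arange(num_samples)
    features = np.random.rand(num_samples, feat_dim).astype(np.float32)
    targets = np.random.randint(0, 10, size=(num_samples, 1))
    for shard in range(num_shards):
        begin, end = shard * shard_size, (shard + 1) * shard_size
        np.save(f"chunk{shard}_{split}_{layer}_features.npy", features[begin:end])
        np.save(f"chunk{shard}_{split}_{layer}_inds.npy", indices[begin:end])
        np.save(f"chunk{shard}_{split}_{layer}_targets.npy", targets[begin:end])
    return indices, features, targets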
Example #8
    def run_benchmarking_preemption_test(
        self,
        checkpoint_path: str,
        with_fsdp: bool,
        with_eval_mlp: bool,
        num_gpu: int = 2,
    ):
        with in_temporary_directory() as temp_dir:
            config = self._create_benchmark_config(
                checkpoint_path,
                with_fsdp=with_fsdp,
                with_eval_mlp=with_eval_mlp,
                num_gpu=num_gpu,
            )
            config.CHECKPOINT.DIR = temp_dir
            results = run_integration_test(config)
            initial_losses = results.get_losses()

            results.clean_final_checkpoint()
            results.clean_logs()

            results = run_integration_test(config)
            restart_losses = results.get_losses()

            print("INITIAL:", initial_losses)
            print("RESTART:", restart_losses)

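            # The restart resumes from the last intermediate checkpoint, so
            # only the losses recorded after that checkpoint are replayed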
            self.assertEqual(initial_losses[5:], restart_losses)
Example #9
    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")

            # Load the checkpoint and perform a linear evaluation on it
            losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir,
                                             "checkpoint_conso.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(losses))
            print(losses)
Example #10
    def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool, num_gpu: int = 2):
        with in_temporary_directory() as temp_dir:
            config = self._create_benchmark_config(
                checkpoint_path, with_fsdp=with_fsdp, num_gpu=num_gpu
            )
            config.CHECKPOINT.DIR = temp_dir
            results = run_integration_test(config)
            return results.get_losses(), results.get_accuracies(from_metrics_file=True)
Example #11
    def test_fine_tuning_end_to_end(self):
        with in_temporary_directory() as pretrain_dir:
            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config()
            run_integration_test(pretrain_config)
            checkpoint_path = os.path.join(pretrain_dir, "checkpoint.torch")

            # Create a separate directory in which to run the fine-tuning
            with in_temporary_directory():
                finetune_config = self._create_finetuning_config(
                    checkpoint_path,
                    construct_single_param_group_only=False,
                    regularize_bias=False,
                )
                result = run_integration_test(finetune_config)
                accuracies = result.get_accuracies(from_metrics_file=True)
                self.assertEqual(4, len(accuracies))
Example #12
    def test_regnet_10b_swav_pretraining(self):
        with in_temporary_directory():
            config = self._create_10B_pretrain_config(
                num_gpus=8, num_steps=2, batch_size=4
            )
            results = run_integration_test(config)
            losses = results.get_losses()
            print(losses)
            self.assertEqual(len(losses), 2)
Example #13
    def test_augly_transforms(self):
        cfg = compose_hydra_configuration([
            "config=test/cpu_test/test_cpu_resnet_simclr.yaml",
            "+config/test/transforms=augly_transforms_example",
        ], )
        _, config = convert_to_attrdict(cfg)

        with in_temporary_directory() as _:
            # Test that the training runs with an augly transformation.
            run_integration_test(config)
Example #14
    def test_legacy_profiler(self):
        with in_temporary_directory() as output_dir:
            config = self._create_config(force_legacy_profiler=True)
            run_integration_test(config)
            files = set(os.listdir(output_dir))
            print(files)
            self.assertIn("cuda_time_rank0.txt", files)
            self.assertIn("cuda_memory_usage_rank0.txt", files)
            self.assertIn("cpu_time_rank0.txt", files)
            self.assertIn("profiler_chrome_trace_rank0.json", files)
Example #15

    def test_checkpoint_consolidation(self):
        with in_temporary_directory():
            for with_heads in [True, False]:
                with with_temp_files(count=1) as sync_file:
                    world_size = 2
                    mp.spawn(
                        self._worker,
                        (sync_file, world_size, with_heads),
                        nprocs=world_size,
                    )
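
The _worker passed to mp.spawn is not shown in this listing. A minimal sketch of such a worker, assuming the temp file serves as a torch.distributed file rendezvous (mp.spawn prepends the process rank to the argument tuple); the consolidation work itself is elided:

import torch.distributed as dist


def _worker(rank: int, sync_file: str, world_size: int, with_heads: bool):
    # Join a process group rendezvoused on the shared sync file
    dist.init_process_group(
        backend="gloo",
        init_method=f"file://{sync_file}",
        rank=rank,
        world_size=world_size,
    )
    # ... build the (optionally head-less) model, save its shards and
    # consolidate the resulting checkpoint ...
    dist.destroy_process_group()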
Example #16
    def test_regnet_10b_evaluation(self):
        with in_temporary_directory():
            cp_path = "/checkpoint/qduval/vissl/seer/regnet10B_sliced/model_iteration124500_sliced.torch"
            config = self._create_10B_evaluation_config(
                num_gpus=8,
                num_steps=2,
                batch_size=4,
                path_to_sliced_checkpoint=cp_path)
            results = run_integration_test(config)
            losses = results.get_losses()
            print(losses)
            self.assertGreater(len(losses), 0)
Example #17
    def test_benchmarking_from_sharded_checkpoint(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # Verify that FSDP can load the checkpoint and run a benchmark on it
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertGreaterEqual(len(fsdp_losses), 0)
            self.assertEqual(4, len(fsdp_accuracies))
Example #18
    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights (3 different ways)
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
            CheckpointFormatConverter.consolidated_to_sliced_checkpoint(
                "checkpoint_conso.torch", "checkpoint_sliced_2.torch")

            # Load the sharded checkpoint and perform a linear evaluation on it
            ref_losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(ref_losses))

            # Then check that the results are the same for the other kind of
            # checkpoints after consolidation has taken place
            for checkpoint_name in [
                    "checkpoint_conso.torch",
                    "checkpoint_sliced.torch",
                    "checkpoint_sliced_2.torch",
            ]:
                losses = self.run_linear_eval(
                    checkpoint_path=os.path.join(pretrain_dir,
                                                 checkpoint_name),
                    with_fsdp=True,
                    with_mixed_precision=False,
                    auto_wrap_threshold=0,
                )
                self.assertEqual(8, len(losses))
                self.assertAlmostEqual(
                    losses[0],
                    ref_losses[0],
                    places=4,
                    msg=f"Failed for {checkpoint_name}",
                )
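
For intuition on what sharded_to_consolidated_checkpoint achieves, here is an illustrative sketch of consolidating per-rank shard files whose tensors were split along dimension 0, one common sharding layout; VISSL's converter handles FSDP's real metadata and is considerably more involved:

from typing import List

import torch


def consolidate_dim0_shards(shard_paths: List[str], output_path: str) -> None:
    # Load every rank's shard of the state dict on CPU
    shards = [torch.load(path, map_location="cpu") for path in shard_paths]

    # Rebuild each full parameter by concatenating its shards along dim 0
    consolidated = {
        name: torch.cat([shard[name] for shard in shards], dim=0)
        for name in shards[0]
    }
    torch.save(consolidated, output_path)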
Example #19

    def run_pretraining(
        self,
        with_fsdp: bool,
        with_activation_checkpointing: bool,
        with_mixed_precision: bool,
    ):
        with in_temporary_directory():
            config = self._create_pretraining_config(
                with_fsdp=with_fsdp,
                with_activation_checkpointing=with_activation_checkpointing,
                with_mixed_precision=with_mixed_precision,
            )
            result = run_integration_test(config)
            return result.get_losses()
Example #20
    def test_pretraining_and_evaluation(self):
        with in_temporary_directory() as pretrain_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=True, gpu_count=2, num_epochs=1)
            result = run_integration_test(config)
            ddp_losses = result.get_losses()
            self.assertGreater(len(ddp_losses), 0)

            eval_config = self._create_dino_linear_eval_config(
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                gpu_count=2,
            )
            eval_losses = self.run_config(eval_config)
            print(eval_losses)
Example #21
    def test_benchmarking_from_sharded_checkpoint_with_preemption(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

            # Verify that FSDP can load the checkpoint and run a benchmark on it
            # and that it can restart from a preemption of the benchmark
            self.run_benchmarking_preemption_test(
                checkpoint_path, with_fsdp=True, with_eval_mlp=True
            )
            self.run_benchmarking_preemption_test(
                checkpoint_path, with_fsdp=True, with_eval_mlp=False
            )
Example #22
    def test_benchmarking_from_a_consolidated_checkpoint(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in DDP mode and save a consolidated checkpoint
            config = self._create_pretraining_config(with_fsdp=False)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # Now, run both DDP and FSDP linear evaluation and compare the traces
            ddp_losses, ddp_accuracies = self.run_benchmarking(checkpoint_path,
                                                               with_fsdp=False)
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertEqual(ddp_losses, fsdp_losses)
            self.assertEqual(ddp_accuracies, fsdp_accuracies)
Example #23
    def run_linear_eval(
        self,
        checkpoint_path: str,
        with_fsdp: bool,
        with_mixed_precision: bool,
        auto_wrap_threshold: int = 0,
    ):
        with in_temporary_directory():
            config = self._create_linear_evaluation_config(
                with_fsdp=with_fsdp,
                with_mixed_precision=with_mixed_precision,
                auto_wrap_threshold=auto_wrap_threshold,
            )
            config.MODEL.WEIGHTS_INIT.PARAMS_FILE = checkpoint_path
            result = run_integration_test(config)
            return result.get_losses()
Example #24
    def run_pretraining(
        self,
        with_fsdp: bool,
        with_activation_checkpointing: bool,
        with_mixed_precision: bool,
        auto_wrap_threshold: int = 0,
        force_sync_all_gather: bool = False,
    ):
        with in_temporary_directory():
            config = self._create_pretraining_config(
                with_fsdp=with_fsdp,
                with_activation_checkpointing=with_activation_checkpointing,
                with_mixed_precision=with_mixed_precision,
                auto_wrap_threshold=auto_wrap_threshold,
                force_sync_all_gather=force_sync_all_gather,
            )
            result = run_integration_test(config)
            return result.get_losses()
Example #25
    def test_ema_hook(self):
        cfg = compose_hydra_configuration(
            [
                "config=test/integration_test/quick_eval_in1k_linear.yaml",
                "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
                "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
                "config.DATA.TEST.DATA_SOURCES=[synthetic]",
                "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
                "config.DATA.TRAIN.DATA_LIMIT=40",
                "config.OPTIMIZER.num_epochs=2",
                "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
                "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
                "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
            ],
        )
        _, config = convert_to_attrdict(cfg)

        with in_temporary_directory() as checkpoint_folder:
            # Run a quick_eval_in1k_linear.
            integration_logs = run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

            # Test that the ema model is saved in the checkpoint.
            checkpoint = load_checkpoint(checkpoint_path)
            self.assertTrue(
                "ema_model" in checkpoint["classy_state_dict"].keys(),
                msg="ema_model has not been saved to the checkpoint folder.",
            )

            # Test that train_accuracy_list_meter_ema have been logged to metrics.json.
            metrics = integration_logs.get_accuracies(from_metrics_file=True)
            self.assertTrue(
                "train_accuracy_list_meter_ema" in metrics[1],
                msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
            )

            self.assertEqual(
                len(metrics),
                8,
                "the metrics.json output does not have the appropriate number of entries.",
            )
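
For context, an EMA model hook keeps a shadow copy of the weights updated as ema = decay * ema + (1 - decay) * param after each step. A minimal sketch of that update (not VISSL's actual EMA_MODEL hook; the decay value is illustrative):

import torch


@torch.no_grad()
def update_ema_model(model: torch.nn.Module, ema_model: torch.nn.Module, decay: float = 0.999):
    # Blend the current weights into the exponential moving average copy
    for param, ema_param in zip(model.parameters(), ema_model.parameters()):
        ema_param.mul_(decay).add_(param, alpha=1.0 - decay)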
Example #26
    def test_prehemption_during_training(self):
        with in_temporary_directory() as temp_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=False, gpu_count=2)
            result = run_integration_test(config)
            losses_before = result.get_losses()

            temp_dir_content = os.listdir(temp_dir)
            self.assertIn("model_final_checkpoint_phase3.torch",
                          temp_dir_content)
            os.remove("model_final_checkpoint_phase3.torch")
            os.remove("checkpoint.torch")
            os.remove("log.txt")

            result = run_integration_test(config)
            losses_after = result.get_losses()
            print(losses_before)
            print(losses_after)
            self.assertAlmostEqual(losses_after[-1],
                                   losses_before[-1],
                                   places=5)
Example #27
    def test_dino_xcit_prehemption(self):
        with in_temporary_directory() as temp_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=False, gpu_count=2
            )

            # For deterministic computing
            config.MODEL.TRUNK.XCIT.DROP_PATH_RATE = 0.0

            result = run_integration_test(config)
            losses_before = result.get_losses()

            temp_dir_content = os.listdir(temp_dir)
            self.assertIn("model_final_checkpoint_phase3.torch", temp_dir_content)
            os.remove("model_final_checkpoint_phase3.torch")
            os.remove("checkpoint.torch")
            os.remove("log.txt")

            result = run_integration_test(config)
            losses_after = result.get_losses()
            print(losses_before)
            print(losses_after)
            self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=4)
Example #28
    def test_sample_features(self):
        with in_temporary_directory() as temp_dir:
            # Save the data we need to sample from
            indices, features, targets = self.prepare_data(split="train",
                                                           layer="heads",
                                                           num_shards=4,
                                                           feat_shape=(10, 16))

            # Sample a subset of the data and check its consistency
            output = ExtractedFeaturesLoader.sample_features(
                input_dir=temp_dir,
                split="train",
                layer="heads",
                num_samples=10,
                seed=0)

            # Check that the number of samples is valid
            self.assertEqual(10, len(output["inds"]))

            # Check that the samples are a subset of the original dataset
            self.assertTrue(
                np.array_equal(output["features"], features[output["inds"]]))
            self.assertTrue(
                np.array_equal(output["targets"], targets[output["inds"]]))
Example #29
    def test_restart_after_preemption_at_iteration(self):
        with in_temporary_directory():
            config = self._create_pretraining_config(with_fsdp=False)
            config.CHECKPOINT.CHECKPOINT_ITER_FREQUENCY = 3
            # TODO - understand why the losses do not match exactly for iteration preemption
            self.run_preemption_test(config, compare_losses=False)
Example #30
    def test_restart_after_preemption_at_epoch_fsdp(self):
        with in_temporary_directory():
            config = self._create_pretraining_config(with_fsdp=True)
            config.OPTIMIZER.num_epochs = 2
            self.run_preemption_test(config)
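
run_preemption_test itself does not appear in this listing; run_benchmarking_preemption_test above (Example #8) suggests its shape. A sketch under that assumption, written as a method of the same test case:

    def run_preemption_test(self, config, compare_losses: bool = True):
        # First full run: record the losses of every iteration
        results = run_integration_test(config)
        initial_losses = results.get_losses()

        # Simulate a preemption: drop the final checkpoint and the logs so
        # that the next run resumes from the last intermediate checkpoint
        results.clean_final_checkpoint()
        results.clean_logs()

        # Restart and compare the replayed losses against the initial ones
        results = run_integration_test(config)
        restart_losses = results.get_losses()
        if compare_losses:
            self.assertEqual(initial_losses[-len(restart_losses):], restart_losses)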