def run_cluster_assignment(self, with_fsdp: bool):
    with in_temporary_directory() as pretrain_dir:
        # Pre-train a SwAV model in order to get some weights
        pretrain_config = self._create_pretraining_config(with_fsdp=with_fsdp)
        run_integration_test(pretrain_config)

        # Extract the cluster assignments of each sample
        with in_temporary_directory() as extract_dir:
            extract_config = self._create_extract_cluster_config(
                with_fsdp=with_fsdp,
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            )
            run_integration_test(extract_config, engine_name="extract_cluster")
            self.assertIn("cluster_assignments.torch", os.listdir(extract_dir))
            shutil.move(
                src=os.path.join(extract_dir, "cluster_assignments.torch"),
                dst=os.path.join(pretrain_dir, "cluster_assignments.torch"),
            )

        # Load the cluster assignments and check their structure
        assignments = ClusterAssignmentLoader.load_cluster_assigment(
            "cluster_assignments.torch"
        )
        self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
        self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
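# Note: the tests in this file lean heavily on the `in_temporary_directory`
# helper. A minimal sketch of what such a context manager could look like
# (an illustrative assumption, not VISSL's actual implementation):
import contextlib
import os
import tempfile

@contextlib.contextmanager
def in_temporary_directory():
    """Create a temp dir, chdir into it, and restore the previous cwd on exit."""
    old_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        try:
            yield temp_dir
        finally:
            os.chdir(old_cwd)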
def test_fine_tuning_end_to_end_fsdp(self):
    with in_temporary_directory() as pretrain_dir:
        # Run a pre-training to have some weights to begin with
        pretrain_config = self._create_pretraining_config(
            with_fsdp=True, fsdp_flatten_parameters=True
        )
        run_integration_test(pretrain_config)
        sharded_checkpoint_path = os.path.join(pretrain_dir, "checkpoint.torch")
        sliced_checkpoint_path = os.path.join(pretrain_dir, "sliced.torch")
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            input_checkpoint_path=sharded_checkpoint_path,
            output_checkpoint_path=sliced_checkpoint_path,
        )

        # Create a separate directory in which to run the fine-tuning
        with in_temporary_directory():
            finetune_config = self._create_finetuning_config(
                sliced_checkpoint_path,
                construct_single_param_group_only=False,
                regularize_bias=False,
                with_fsdp=True,
                fsdp_flatten_parameters=False,
            )
            result = run_integration_test(finetune_config)
            accuracies = result.get_accuracies(from_metrics_file=True)
            self.assertEqual(4, len(accuracies))
def run_cluster_assignment(self, with_fsdp: bool):
    with in_temporary_directory() as pretrain_dir:
        # Pre-train a SwAV model in order to get some weights
        pretrain_config = self._create_pretraining_config(with_fsdp=with_fsdp)
        run_integration_test(pretrain_config)

        # Extract the cluster assignments of each sample
        with in_temporary_directory() as extract_dir:
            extract_config = self._create_extract_cluster_config(
                with_fsdp=with_fsdp,
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            )
            extract_config.EXTRACT_FEATURES.CHUNK_THRESHOLD = 10
            run_integration_test(extract_config, engine_name="extract_cluster")
            extraction_outputs = os.listdir(extract_dir)

            # Check that the cluster assignments are computed in both
            # compact format and dataset disk_filelist format
            self.assertIn("cluster_assignments.torch", extraction_outputs)
            self.assertIn("train_images.npy", extraction_outputs)
            self.assertIn("train_labels.npy", extraction_outputs)
            self.assertIn("test_images.npy", extraction_outputs)
            self.assertIn("test_labels.npy", extraction_outputs)

            # Check that the soft assignments (on prototypes) are exported
            for rank in range(2):
                for chunk in range(2):
                    file_name = f"rank{rank}_chunk{chunk}_train_heads_protos.npy"
                    self.assertIn(file_name, extraction_outputs)
                    self.assertEqual(np.load(file_name).shape[1], 3000)
                file_name = f"rank{rank}_chunk0_test_heads_protos.npy"
                self.assertIn(file_name, extraction_outputs)
                self.assertEqual(np.load(file_name).shape[1], 3000)

            # Copy the cluster assignments
            shutil.move(
                src=os.path.join(extract_dir, "cluster_assignments.torch"),
                dst=os.path.join(pretrain_dir, "cluster_assignments.torch"),
            )

        # Load the cluster assignments and check their structure
        assignments = ClusterAssignmentLoader.load_cluster_assigment(
            "cluster_assignments.torch"
        )
        self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
        self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
def test_extract_cluster_assignment_ddp(self):
    with in_temporary_directory() as pretrain_dir:
        # Run a pre-training to have some weights to begin with
        pretrain_config = self._create_pretraining_config()
        run_integration_test(pretrain_config)

        # Create a directory to contain the extracted features
        with in_temporary_directory() as extract_dir:
            # Run the extract engine in a separate directory to check that
            # it is correctly able to output the features in another dir
            with in_temporary_directory():
                extract_config = self._create_extract_features_config(
                    checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch")
                )
                extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(extract_config, engine_name="extract_features")

            # Check the content of the directory containing the extracted features
            folder_content = os.listdir(extract_dir)
            print(folder_content)
            for rank in [0, 1]:
                for chunk in range(5):
                    for file in [
                        f"rank{rank}_chunk{chunk}_train_heads_features.npy",
                        f"rank{rank}_chunk{chunk}_train_heads_inds.npy",
                        f"rank{rank}_chunk{chunk}_train_heads_targets.npy",
                    ]:
                        self.assertIn(file, folder_content)

            # Verify that we can merge the features back (train split)
            train_feat = merge_features(extract_dir, "train", "heads")
            print(train_feat)
            self.assertEqual(train_feat["features"].shape, torch.Size([40, 128]))
            self.assertEqual(train_feat["targets"].shape, torch.Size([40, 1]))
            self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

            # Verify that we can merge the features back (test split)
            test_feat = merge_features(extract_dir, "test", "heads")
            self.assertEqual(test_feat["features"].shape, torch.Size([20, 128]))
            self.assertEqual(test_feat["targets"].shape, torch.Size([20, 1]))
            self.assertEqual(test_feat["inds"].shape, torch.Size([20]))
def test_get_shard_file_names(self):
    with in_temporary_directory() as temp_dir:
        # Generate a bunch of split/feature files
        for split in ["train", "test"]:
            for layer in ["heads", "res5"]:
                self.prepare_data(
                    split=split, layer=layer, num_shards=2, feat_shape=(10, 16)
                )

        # Check that we only consider the right files
        paths = ExtractedFeaturesLoader.get_shard_file_names(
            input_dir=temp_dir, split="train", layer="heads"
        )
        feature_files = {os.path.split(path.feature_file)[1] for path in paths}
        self.assertEqual(
            feature_files,
            {
                "chunk0_train_heads_features.npy",
                "chunk1_train_heads_features.npy",
            },
        )
def run_config(self, config, with_memory: bool = False):
    with in_temporary_directory():
        result = run_integration_test(config)
        losses = result.get_losses()
        if with_memory:
            return losses, result.get_peak_memory()
        return losses
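# Example usage of the `run_config` helper above (hypothetical call sites,
# mirroring how the DINO evaluation test below consumes it):
#
#   eval_losses = self.run_config(eval_config)
#   eval_losses, peak_memory = self.run_config(eval_config, with_memory=True)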
def test_merge_features(self):
    with in_temporary_directory() as temp_dir:
        # Save the data we need to merge back
        indices, features, targets = self.prepare_data(
            split="train", layer="heads", num_shards=4, feat_shape=(10, 16)
        )

        # Load the data and verify that it is identical
        output = ExtractedFeaturesLoader.load_features(
            input_dir=temp_dir, split="train", layer="heads"
        )
        self.assertEqual(output["features"].shape[0], 40)
        self.assertTrue(np.array_equal(output["inds"], indices))
        self.assertTrue(np.array_equal(output["targets"], targets))
        self.assertTrue(np.allclose(output["features"], features))

        # Sample all the data (no sampling) and check that it is identical
        output = ExtractedFeaturesLoader.sample_features(
            input_dir=temp_dir, split="train", layer="heads", num_samples=-1, seed=0
        )
        self.assertEqual(output["features"].shape[0], 40)
        self.assertTrue(np.array_equal(output["inds"], indices))
        self.assertTrue(np.array_equal(output["targets"], targets))
        self.assertTrue(np.allclose(output["features"], features))
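# Conceptually, merging the feature shards checked above amounts to
# concatenating the per-shard arrays and restoring global index order.
# A minimal sketch of that idea (an assumption about the semantics, not
# the actual ExtractedFeaturesLoader code):
import numpy as np

def merge_feature_shards(shard_inds, shard_feats, shard_targets):
    """Concatenate shard arrays and sort rows back into global index order."""
    inds = np.concatenate(shard_inds)
    order = np.argsort(inds)
    return {
        "inds": inds[order],
        "features": np.concatenate(shard_feats)[order],
        "targets": np.concatenate(shard_targets)[order],
    }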
def run_benchmarking_preemption_test(
    self,
    checkpoint_path: str,
    with_fsdp: bool,
    with_eval_mlp: bool,
    num_gpu: int = 2,
):
    with in_temporary_directory() as temp_dir:
        config = self._create_benchmark_config(
            checkpoint_path,
            with_fsdp=with_fsdp,
            with_eval_mlp=with_eval_mlp,
            num_gpu=num_gpu,
        )
        config.CHECKPOINT.DIR = temp_dir
        results = run_integration_test(config)
        initial_losses = results.get_losses()

        results.clean_final_checkpoint()
        results.clean_logs()

        results = run_integration_test(config)
        restart_losses = results.get_losses()

        print("INITIAL:", initial_losses)
        print("RESTART:", restart_losses)
        self.assertEqual(initial_losses[5:], restart_losses)
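# Note on the assertion above: the restarted run resumes from the last
# intermediate checkpoint rather than from scratch, which is presumably why
# only the tail of the first run's losses (initial_losses[5:]) is expected
# to match the losses recorded after the restart.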
def test_fsdp_integration_with_linear_eval(self):
    with in_temporary_directory() as pretrain_dir:
        # Start pre-training
        config = self._create_pretraining_config(
            with_fsdp=True,
            with_activation_checkpointing=True,
            with_mixed_precision=False,
            auto_wrap_threshold=0,
        )
        run_integration_test(config)

        # Consolidate the weights
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            "checkpoint.torch", "checkpoint_conso.torch"
        )

        # Load the checkpoint and perform a linear evaluation on it
        losses = self.run_linear_eval(
            checkpoint_path=os.path.join(pretrain_dir, "checkpoint_conso.torch"),
            with_fsdp=True,
            with_mixed_precision=False,
            auto_wrap_threshold=0,
        )
        self.assertEqual(8, len(losses))
        print(losses)
def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool, num_gpu: int = 2):
    with in_temporary_directory() as temp_dir:
        config = self._create_benchmark_config(
            checkpoint_path, with_fsdp=with_fsdp, num_gpu=num_gpu
        )
        config.CHECKPOINT.DIR = temp_dir
        results = run_integration_test(config)
        return results.get_losses(), results.get_accuracies(from_metrics_file=True)
def test_fine_tuning_end_to_end(self):
    with in_temporary_directory() as pretrain_dir:
        # Run a pre-training to have some weights to begin with
        pretrain_config = self._create_pretraining_config()
        run_integration_test(pretrain_config)
        checkpoint_path = os.path.join(pretrain_dir, "checkpoint.torch")

        # Create a separate directory in which to run the fine-tuning
        with in_temporary_directory():
            finetune_config = self._create_finetuning_config(
                checkpoint_path,
                construct_single_param_group_only=False,
                regularize_bias=False,
            )
            result = run_integration_test(finetune_config)
            accuracies = result.get_accuracies(from_metrics_file=True)
            self.assertEqual(4, len(accuracies))
def test_regnet_10b_swav_pretraining(self):
    with in_temporary_directory():
        config = self._create_10B_pretrain_config(
            num_gpus=8, num_steps=2, batch_size=4
        )
        results = run_integration_test(config)
        losses = results.get_losses()
        print(losses)
        self.assertEqual(len(losses), 2)
def test_augly_transforms(self):
    cfg = compose_hydra_configuration(
        [
            "config=test/cpu_test/test_cpu_resnet_simclr.yaml",
            "+config/test/transforms=augly_transforms_example",
        ],
    )
    _, config = convert_to_attrdict(cfg)
    with in_temporary_directory():
        # Test that the training runs with an augly transformation.
        run_integration_test(config)
def test_legacy_profiler(self):
    with in_temporary_directory() as output_dir:
        config = self._create_config(force_legacy_profiler=True)
        run_integration_test(config)
        files = set(os.listdir(output_dir))
        print(files)
        self.assertIn("cuda_time_rank0.txt", files)
        self.assertIn("cuda_memory_usage_rank0.txt", files)
        self.assertIn("cpu_time_rank0.txt", files)
        self.assertIn("profiler_chrome_trace_rank0.json", files)
def test_checkpoint_consolidation(self):
    with in_temporary_directory():
        for with_heads in [True, False]:
            with with_temp_files(count=1) as sync_file:
                world_size = 2
                mp.spawn(
                    self._worker,
                    (sync_file, world_size, with_heads),
                    nprocs=world_size,
                )
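# `mp.spawn(fn, args, nprocs)` calls `fn(rank, *args)` once per process, so
# the `_worker` method above receives the process rank as its first argument.
# A hypothetical skeleton consistent with that calling convention (the real
# worker body is not shown in this file):
def _worker(self, rank: int, sync_file: str, world_size: int, with_heads: bool):
    # e.g. initialize the process group through the file-based rendezvous,
    # build the model (with or without heads), then exercise checkpoint
    # consolidation across the `world_size` processes.
    ...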
def test_regnet_10b_evaluation(self):
    with in_temporary_directory():
        cp_path = "/checkpoint/qduval/vissl/seer/regnet10B_sliced/model_iteration124500_sliced.torch"
        config = self._create_10B_evaluation_config(
            num_gpus=8, num_steps=2, batch_size=4, path_to_sliced_checkpoint=cp_path
        )
        results = run_integration_test(config)
        losses = results.get_losses()
        print(losses)
        self.assertGreater(len(losses), 0)
def test_benchmarking_from_sharded_checkpoint(self):
    with in_temporary_directory() as checkpoint_folder:
        # Run a pre-training in FSDP mode and save a sharded checkpoint
        config = self._create_pretraining_config(with_fsdp=True)
        run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Verify that FSDP can load the checkpoint and run a benchmark on it
        fsdp_losses, fsdp_accuracies = self.run_benchmarking(
            checkpoint_path, with_fsdp=True
        )
        self.assertGreaterEqual(len(fsdp_losses), 0)
        self.assertEqual(4, len(fsdp_accuracies))
def test_fsdp_integration_with_linear_eval(self):
    with in_temporary_directory() as pretrain_dir:
        # Start pre-training
        config = self._create_pretraining_config(
            with_fsdp=True,
            with_activation_checkpointing=True,
            with_mixed_precision=False,
            auto_wrap_threshold=0,
        )
        run_integration_test(config)

        # Consolidate the weights (3 different ways)
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            "checkpoint.torch", "checkpoint_conso.torch"
        )
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            "checkpoint.torch", "checkpoint_sliced.torch"
        )
        CheckpointFormatConverter.consolidated_to_sliced_checkpoint(
            "checkpoint_conso.torch", "checkpoint_sliced_2.torch"
        )

        # Load the sharded checkpoint and perform a linear evaluation on it
        ref_losses = self.run_linear_eval(
            checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            with_fsdp=True,
            with_mixed_precision=False,
            auto_wrap_threshold=0,
        )
        self.assertEqual(8, len(ref_losses))

        # Then check that the results are the same for the other kinds of
        # checkpoints after consolidation has taken place
        for checkpoint_name in [
            "checkpoint_conso.torch",
            "checkpoint_sliced.torch",
            "checkpoint_sliced_2.torch",
        ]:
            losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir, checkpoint_name),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(losses))
            self.assertAlmostEqual(
                losses[0],
                ref_losses[0],
                places=4,
                msg=f"Failed for {checkpoint_name}",
            )
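# Note on the three formats exercised above (as suggested by the converter
# names, not a definitive description): "sharded" keeps one weight shard per
# FSDP rank, "consolidated" gathers everything into a single state dict, and
# "sliced" splits the weights into per-parameter slices so that very large
# models can be loaded without materializing the whole checkpoint at once.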
def run_pretraining(
    self,
    with_fsdp: bool,
    with_activation_checkpointing: bool,
    with_mixed_precision: bool,
):
    with in_temporary_directory():
        config = self._create_pretraining_config(
            with_fsdp=with_fsdp,
            with_activation_checkpointing=with_activation_checkpointing,
            with_mixed_precision=with_mixed_precision,
        )
        result = run_integration_test(config)
        return result.get_losses()
def test_pretraining_and_evaluation(self):
    with in_temporary_directory() as pretrain_dir:
        config = self._create_dino_pretraining_config(
            with_mixed_precision=True, gpu_count=2, num_epochs=1
        )
        result = run_integration_test(config)
        ddp_losses = result.get_losses()
        self.assertGreater(len(ddp_losses), 0)

        eval_config = self._create_dino_linear_eval_config(
            checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            gpu_count=2,
        )
        eval_losses = self.run_config(eval_config)
        print(eval_losses)
def test_benchmarking_from_sharded_checkpoint_with_preemption(self):
    with in_temporary_directory() as checkpoint_folder:
        # Run a pre-training in FSDP mode and save a sharded checkpoint
        config = self._create_pretraining_config(with_fsdp=True)
        run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Verify that FSDP can load the checkpoint and run a benchmark on it,
        # and that it can restart from a preemption of the benchmark
        self.run_benchmarking_preemption_test(
            checkpoint_path, with_fsdp=True, with_eval_mlp=True
        )
        self.run_benchmarking_preemption_test(
            checkpoint_path, with_fsdp=True, with_eval_mlp=False
        )
def test_benchmarking_from_a_consolidated_checkpoint(self):
    with in_temporary_directory() as checkpoint_folder:
        # Run a pre-training in DDP mode and save a consolidated checkpoint
        config = self._create_pretraining_config(with_fsdp=False)
        run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Now, run both DDP and FSDP linear evaluation and compare the traces
        ddp_losses, ddp_accuracies = self.run_benchmarking(
            checkpoint_path, with_fsdp=False
        )
        fsdp_losses, fsdp_accuracies = self.run_benchmarking(
            checkpoint_path, with_fsdp=True
        )
        self.assertEqual(ddp_losses, fsdp_losses)
        self.assertEqual(ddp_accuracies, fsdp_accuracies)
def run_linear_eval(
    self,
    checkpoint_path: str,
    with_fsdp: bool,
    with_mixed_precision: bool,
    auto_wrap_threshold: int = 0,
):
    with in_temporary_directory():
        config = self._create_linear_evaluation_config(
            with_fsdp=with_fsdp,
            with_mixed_precision=with_mixed_precision,
            auto_wrap_threshold=auto_wrap_threshold,
        )
        config.MODEL.WEIGHTS_INIT.PARAMS_FILE = checkpoint_path
        result = run_integration_test(config)
        return result.get_losses()
def run_pretraining(
    self,
    with_fsdp: bool,
    with_activation_checkpointing: bool,
    with_mixed_precision: bool,
    auto_wrap_threshold: int = 0,
    force_sync_all_gather: bool = False,
):
    with in_temporary_directory():
        config = self._create_pretraining_config(
            with_fsdp=with_fsdp,
            with_activation_checkpointing=with_activation_checkpointing,
            with_mixed_precision=with_mixed_precision,
            auto_wrap_threshold=auto_wrap_threshold,
            force_sync_all_gather=force_sync_all_gather,
        )
        result = run_integration_test(config)
        return result.get_losses()
def test_ema_hook(self):
    cfg = compose_hydra_configuration(
        [
            "config=test/integration_test/quick_eval_in1k_linear.yaml",
            "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
            "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
            "config.DATA.TEST.DATA_SOURCES=[synthetic]",
            "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
            "config.DATA.TRAIN.DATA_LIMIT=40",
            "config.OPTIMIZER.num_epochs=2",
            "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
            "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
            "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
        ],
    )
    _, config = convert_to_attrdict(cfg)

    with in_temporary_directory() as checkpoint_folder:
        # Run a quick_eval_in1k_linear.
        integration_logs = run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Test that the EMA model is saved in the checkpoint.
        checkpoint = load_checkpoint(checkpoint_path)
        self.assertTrue(
            "ema_model" in checkpoint["classy_state_dict"].keys(),
            msg="ema_model has not been saved to the checkpoint folder.",
        )

        # Test that train_accuracy_list_meter_ema has been logged to metrics.json.
        metrics = integration_logs.get_accuracies(from_metrics_file=True)
        self.assertTrue(
            "train_accuracy_list_meter_ema" in metrics[1],
            msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
        )
        self.assertEqual(
            len(metrics),
            8,
            "the metrics.json output does not have the appropriate number of entries.",
        )
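# The EMA model checked above is maintained as an exponential moving average
# of the live weights. A minimal sketch of the standard per-parameter update
# rule (the decay value is illustrative, not VISSL's default):
import torch

@torch.no_grad()
def ema_update(ema_model, model, decay: float = 0.999):
    """Blend the live parameters into the EMA copy: ema = d * ema + (1 - d) * p."""
    for ema_p, p in zip(ema_model.parameters(), model.parameters()):
        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)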
def test_prehemption_during_training(self):
    with in_temporary_directory() as temp_dir:
        config = self._create_dino_pretraining_config(
            with_mixed_precision=False, gpu_count=2
        )
        result = run_integration_test(config)
        losses_before = result.get_losses()

        temp_dir_content = os.listdir(temp_dir)
        self.assertIn("model_final_checkpoint_phase3.torch", temp_dir_content)
        os.remove("model_final_checkpoint_phase3.torch")
        os.remove("checkpoint.torch")
        os.remove("log.txt")

        result = run_integration_test(config)
        losses_after = result.get_losses()
        print(losses_before)
        print(losses_after)
        self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=5)
def test_dino_xcit_prehemption(self):
    with in_temporary_directory() as temp_dir:
        config = self._create_dino_pretraining_config(
            with_mixed_precision=False, gpu_count=2
        )
        # For deterministic computing
        config.MODEL.TRUNK.XCIT.DROP_PATH_RATE = 0.0

        result = run_integration_test(config)
        losses_before = result.get_losses()

        temp_dir_content = os.listdir(temp_dir)
        self.assertIn("model_final_checkpoint_phase3.torch", temp_dir_content)
        os.remove("model_final_checkpoint_phase3.torch")
        os.remove("checkpoint.torch")
        os.remove("log.txt")

        result = run_integration_test(config)
        losses_after = result.get_losses()
        print(losses_before)
        print(losses_after)
        self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=4)
def test_sample_features(self):
    with in_temporary_directory() as temp_dir:
        # Save the data we need to sample from
        indices, features, targets = self.prepare_data(
            split="train", layer="heads", num_shards=4, feat_shape=(10, 16)
        )

        # Sample a subset of the data
        output = ExtractedFeaturesLoader.sample_features(
            input_dir=temp_dir, split="train", layer="heads", num_samples=10, seed=0
        )

        # Check that the number of samples is valid
        self.assertEqual(10, len(output["inds"]))

        # Check that the samples are a subset of the original dataset
        self.assertTrue(np.array_equal(output["features"], features[output["inds"]]))
        self.assertTrue(np.array_equal(output["targets"], targets[output["inds"]]))
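# Seeded sub-sampling as exercised above can be sketched with a numpy
# generator; the returned "inds" index into the original arrays, which is
# what the subset assertions rely on (an assumption about the semantics,
# not the actual sample_features code):
import numpy as np

def sample_rows(features, targets, num_samples, seed):
    """Draw a reproducible subset and return it with its source indices."""
    rng = np.random.default_rng(seed)
    inds = rng.choice(features.shape[0], size=num_samples, replace=False)
    return {"inds": inds, "features": features[inds], "targets": targets[inds]}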
def test_restart_after_preemption_at_iteration(self):
    with in_temporary_directory():
        config = self._create_pretraining_config(with_fsdp=False)
        config.CHECKPOINT.CHECKPOINT_ITER_FREQUENCY = 3
        # TODO - understand why the losses do not match exactly for iteration preemption
        self.run_preemption_test(config, compare_losses=False)
def test_restart_after_preemption_at_epoch_fsdp(self):
    with in_temporary_directory():
        config = self._create_pretraining_config(with_fsdp=True)
        config.OPTIMIZER.num_epochs = 2
        self.run_preemption_test(config)