def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase):
    """Train under the LR range test scheduler and check the recorded LR trajectory.

    The learning rate read before the first step must equal ``[min_lr]``;
    the full trajectory is then handed to the staircase or continuous
    verification helper depending on ``staircase``.
    """
    range_test_params = {
        LR_RANGE_TEST_MIN_LR: min_lr,
        LR_RANGE_TEST_STEP_RATE: step_rate,
        LR_RANGE_TEST_STEP_SIZE: step_size,
        LR_RANGE_TEST_STAIRCASE: staircase,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "scheduler": {"type": LR_RANGE_TEST, "params": range_test_params},
        "gradient_clipping": 1.0,
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase):
        engine, _, _, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=max(50, step_size * 2),
            hidden_dim=hidden_dim,
            device=engine.device,
            dtype=torch.float,
        )

        # Record the LR *before* each optimizer step.
        observed_lrs = []
        for batch in loader:
            observed_lrs.append(scheduler.get_lr())
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

        # The very first LR must be the configured minimum.
        assert observed_lrs[0] == min_lr

        if not staircase:
            # Continuous schedule.
            _verify_continuous_increase(observed_lrs)
        else:
            # Staircase schedule.
            _verify_staircase_increase(observed_lrs, step_size)

    # get_lr() output is compared against a list, hence the wrapping.
    _test_lr_range_test(
        args=args,
        model=model,
        hidden_dim=hidden_dim,
        min_lr=[min_lr],
        step_size=step_size,
        staircase=staircase,
    )
def test_lamb_fp16_empty_grad(tmpdir):
    """Train with Lamb in fp16 on a ``SimpleModel`` built with ``empty_grad=True``.

    Launched with ``world_size=[2]``; the test passes when the training loop
    finishes without raising.
    """
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Lamb", "params": {"lr": 0.00015}},
        "gradient_clipping": 1.0,
        "fp16": {"enabled": True},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim)
def test_zero_empty_partition(tmpdir):
    """Initialize ZeRO with fewer parameters than ranks and call ``step()`` once.

    The model is asserted to have exactly two parameters while the test runs
    with ``world_size=[3]``.
    """
    ds_config = {
        "train_batch_size": 3,
        "fp16": {"enabled": True},
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "zero_optimization": True,
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args):
        hidden_dim = 1
        model = SimpleModel(hidden_dim)

        # Two parameters across DP=3 is meant to leave one partition empty.
        param_count = len(list(model.parameters()))
        assert param_count == 2

        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        engine.step()

    _test_zero_empty_partition(args)
def test_dataloader_drop_last(tmpdir, train_batch_size, drop_last):
    """Train from the dataloader returned by ``deepspeed.initialize``.

    ``train_batch_size`` and ``drop_last`` are forwarded to the DeepSpeed
    config (``dataloader_drop_last``); the test passes when iterating the
    whole dataloader and stepping on every batch raises nothing.
    """
    ds_config = {
        "train_batch_size": train_batch_size,
        "dataloader_drop_last": drop_last,
        "steps_per_print": 1,
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_dataloader_drop_last(args, model, hidden_dim):
        optimizer = torch.optim.AdamW(params=model.parameters())
        # TODO: Figure out why this breaks with cuda device
        train_dataset = random_dataset(
            total_samples=50,
            hidden_dim=hidden_dim,
            device=torch.device('cpu'),
            dtype=torch.float32,
        )
        engine, _, training_dataloader, _ = deepspeed.initialize(
            args=args,
            model=model,
            training_data=train_dataset,
            optimizer=optimizer,
        )
        for batch in training_dataloader:
            # The dataset lives on CPU, so move each batch to the current GPU.
            inputs = batch[0].to(torch.cuda.current_device())
            targets = batch[1].to(torch.cuda.current_device())
            loss = engine(inputs, targets)
            engine.backward(loss)
            engine.step()

    _test_dataloader_drop_last(args=args, model=model, hidden_dim=hidden_dim)
def test_non_elastic_batch_params_w_override(tmpdir):
    """Initialize DeepSpeed with elasticity enabled alongside an explicit batch size.

    The config sets both ``train_batch_size`` and an ``elasticity`` section
    with ``ignore_non_elastic_batch_info`` turned on; the test passes when
    ``deepspeed.initialize`` succeeds for world sizes 1 and 2.
    """
    elasticity_section = {
        "enabled": True,
        "max_train_batch_size": 4,
        "micro_batch_sizes": [1, 2, 3, 4],
        "min_gpus": 1,
        "max_gpus": 4,
        "min_time": 20,
        "version": 0.1,
        "ignore_non_elastic_batch_info": True,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Lamb", "params": {"lr": 0.00015}},
        "gradient_clipping": 1.0,
        "elasticity": elasticity_section,
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1, 2])
    def _test_elastic(args, model, hidden_dim):
        # Only initialization is exercised; no training step is run.
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )

    _test_elastic(args=args, model=model, hidden_dim=hidden_dim)
def test_stage2_ignore_unused_parameters(tmpdir, ignore_unused_parameters):
    """Train ``UnusedParametersModel`` under ZeRO stage 2 with CPU offload.

    With ``ignore_unused_parameters`` truthy the loop must complete; otherwise
    it must raise an ``AssertionError`` whose first argument mentions
    ``'ignore_unused_parameters'``. Skipped when cpu-adam is not compatible.
    """
    use_cpu_offload = True
    cpu_adam_ok = deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]
    if use_cpu_offload and not cpu_adam_ok:
        pytest.skip("cpu-adam is not compatible")

    ds_config = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 2,
        "steps_per_print": 1,
        "zero_optimization": {
            "stage": 2,
            "cpu_offload": use_cpu_offload,
            "ignore_unused_parameters": ignore_unused_parameters,
        },
        "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
        "fp16": {"enabled": True, "initial_scale_power": 8},
    }
    hidden_dim = 4
    args = args_from_dict(tmpdir, ds_config)
    model = UnusedParametersModel(hidden_dim=hidden_dim)

    @distributed_test(world_size=[1])
    def _test_stage2_ignore_unused_parameters(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=10,
            hidden_dim=hidden_dim,
            device=engine.device,
        )

        def _loop():
            for batch in loader:
                loss = engine(batch[0], batch[1])
                engine.backward(loss)
                engine.step()

        if not ignore_unused_parameters:
            with pytest.raises(AssertionError) as excinfo:
                _loop()
            # The failure message must point the user at the config switch.
            err_args = excinfo.value.args
            assert err_args and 'ignore_unused_parameters' in err_args[0]
        else:
            _loop()

    _test_stage2_ignore_unused_parameters(args=args, model=model, hidden_dim=hidden_dim)
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
    """Train with Adam + fp16 + ZeRO while the OneCycle scheduler is configured.

    ``zero_stage`` and ``use_cpu_offload`` populate the ``zero_optimization``
    section. Skipped when CPU offload is requested but cpu-adam is not
    compatible. Passes when the training loop completes.
    """
    if use_cpu_offload:
        if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")

    one_cycle_params = {
        "cycle_first_step_size": 16000,
        "cycle_first_stair_count": 8000,
        "decay_step_size": 16000,
        "cycle_min_lr": 1e-06,
        "cycle_max_lr": 3e-05,
        "decay_lr_rate": 1e-07,
        "cycle_min_mom": 0.85,
        "cycle_max_mom": 0.99,
        "decay_mom_rate": 0.0,
    }
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "scheduler": {"type": "OneCycle", "params": one_cycle_params},
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": zero_stage, "cpu_offload": use_cpu_offload},
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
        # The model is built inside the worker rather than passed in.
        net = SimpleModel(hidden_dim)
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=net,
            model_parameters=net.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_adam_fp16_zero_onecycle_compatibility(
        args=args,
        zero_stage=zero_stage,
        hidden_dim=hidden_dim,
    )
def test_zero_allow_untested_optimizer(tmpdir, zero_stage):
    """Expect ``deepspeed.initialize`` to reject a ``SimpleOptimizer`` under ZeRO.

    The config sets ``zero_allow_untested_optimizer`` to ``False``;
    initialization with the client optimizer must raise ``AssertionError``.
    """
    ds_config = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": zero_stage},
        "zero_allow_untested_optimizer": False,
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args):
        hidden_dim = 10
        net = SimpleModel(hidden_dim, empty_grad=True)
        client_optimizer = SimpleOptimizer(net.parameters())
        with pytest.raises(AssertionError):
            engine, optim, _, _ = deepspeed.initialize(
                args=args,
                model=net,
                optimizer=client_optimizer,
                model_parameters=net.parameters(),
            )

    _test_zero_allow_untested_optimizer(args)
def test_client_optimizer(tmpdir, optimizer_type):
    """Check which optimizer ``deepspeed.initialize`` returns for each client input.

    * ``optimizer_type is None``: no client optimizer, the config requests
      ``ADAM_OPTIMIZER`` and a ``FusedAdam`` instance is expected.
    * ``optimizer_type is Optimizer``: an ``Adam`` instance is passed and the
      returned optimizer must compare equal to it.
    * anything else: a callable building ``AdamW`` is passed and an ``AdamW``
      instance is expected.
    """

    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)
    ds_config = {'train_batch_size': 1}

    if optimizer_type is None:
        # Let DeepSpeed build the optimizer from the config.
        ds_config['optimizer'] = {'type': ADAM_OPTIMIZER}
        client_optimizer = None
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=list(model.parameters()),
            optimizer=client_optimizer,
        )
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args, model=model, client_optimizer=client_optimizer)
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
    """Expect ``deepspeed.initialize`` to reject a ``SimpleOptimizer`` under ZeRO.

    ``zero_stage`` and ``use_cpu_offload`` populate ``zero_optimization`` and
    ``zero_allow_untested_optimizer`` is ``False``; initialization must raise
    ``AssertionError``. Skipped when CPU offload is requested but cpu-adam is
    not compatible.
    """
    if use_cpu_offload:
        if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")

    ds_config = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": zero_stage, "cpu_offload": use_cpu_offload},
        "zero_allow_untested_optimizer": False,
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[1])
    def _test_zero_allow_untested_optimizer(args, zero_stage):
        hidden_dim = 10
        net = SimpleModel(hidden_dim)
        client_optimizer = SimpleOptimizer(net.parameters())
        with pytest.raises(AssertionError):
            engine, optim, _, _ = deepspeed.initialize(
                args=args,
                model=net,
                optimizer=client_optimizer,
                model_parameters=net.parameters(),
            )

    _test_zero_allow_untested_optimizer(args, zero_stage)
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
    """Initialize ZeRO with a client optimizer built by ``optimizer_constructor``.

    The constructor is called with ``params=model.parameters()``; the test
    passes when ``deepspeed.initialize`` accepts the resulting optimizer.
    """
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": zero_stage},
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 10

    @distributed_test(world_size=[1])
    def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
        # hidden_dim is taken from the enclosing test function.
        net = SimpleModel(hidden_dim)
        client_optimizer = optimizer_constructor(params=net.parameters())
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=net,
            optimizer=client_optimizer,
        )

    _test_zero_supported_client_optimizer(
        args=args,
        zero_stage=zero_stage,
        optimizer_constructor=optimizer_constructor,
    )
def test_adam_amp_basic(tmpdir):
    """Train with a client ``torch.optim.Adam`` while ``amp`` is enabled in the config.

    Passes when the training loop completes without raising.
    """
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "amp": {"enabled": True},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[1])
    def _test_adam_amp_basic(args, model, hidden_dim):
        client_optimizer = torch.optim.Adam(params=model.parameters())
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=client_optimizer,
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim)
def test_checkpoint_fp32_optimizer(tmpdir):
    """Run the checkpoint correctness check with fp16 disabled and an Adam optimizer.

    Delegates to ``checkpoint_correctness_verification`` with ``fp16=False``
    under ``world_size=[2]``.
    """
    adam_params = {
        "lr": 0.00015,
        "betas": [0.8, 0.999],
        "eps": 1e-8,
        "weight_decay": 3e-7,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": adam_params},
        "fp16": {"enabled": False},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_fp32_optimizer(args, model, hidden_dim):
        # tmpdir comes from the enclosing test function.
        checkpoint_correctness_verification(
            args,
            model,
            hidden_dim,
            tmpdir,
            fp16=False,
        )

    _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim)
def test_unfused_some_overflow(tmpdir):
    """Check dynamic loss-scale bookkeeping across overflow and normal steps (Lamb, fp16).

    The scale starts at ``2**8`` with a scale window of 2. The test then
    feeds, in order: two overflowing gradients (scale halves twice),
    ``window + 1`` normal gradients (scale doubles once), and one more
    overflow (scale halves), asserting ``cur_scale`` and ``cur_iter`` after
    each phase.
    """
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {"type": "Lamb", "params": {"lr": 0.00015}},
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 8,
            "loss_scale_window": 2,
        },
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=1)
    def _test_unfused_some_overflow(args):
        hidden_dim = 1
        net = SimpleModel(hidden_dim, empty_grad=True)
        engine, optim, _, _ = deepspeed.initialize(
            args=args,
            model=net,
            model_parameters=net.parameters(),
        )

        want_scale = 2**8
        want_window = 2
        want_iter = 0

        # Ensure the dynamic loss scaler is correctly configured.
        assert optim.dynamic_loss_scale == True
        assert optim.cur_scale == want_scale
        assert optim.scale_window == want_window

        # Phase 1: overflows, each one halving the scale.
        bad_grads = [float('inf'), float('nan')]
        want_iter += len(bad_grads)
        run_model_step(engine, bad_grads)
        want_scale /= (2**len(bad_grads))
        assert optim.cur_scale == want_scale
        assert optim.cur_iter == want_iter

        # Phase 2: window + 1 clean steps, which doubles the scale once.
        good_grads = np.random.uniform(-0.1, 0.1, want_window + 1)
        want_iter += len(good_grads)
        run_model_step(engine, good_grads)
        want_scale *= 2
        assert optim.cur_scale == want_scale
        assert optim.cur_iter == want_iter

        # Phase 3: a single overflow halves the scale again.
        bad_grads = [float('inf')]
        want_iter += len(bad_grads)
        run_model_step(engine, bad_grads)
        want_scale /= (2**len(bad_grads))
        assert optim.cur_scale == want_scale
        assert optim.cur_iter == want_iter

    _test_unfused_some_overflow(args)
def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool):
    """Check that every rank sees the same total grad norm when training an MoE model with Lamb.

    ``optimizer.unscale_and_clip_grads`` is monkeypatched with a mock that
    combines the norm groups into one total norm, all-gathers it across
    ranks, and asserts that all gathered values are identical.
    Skipped when ``required_torch_version()`` is falsy.
    """
    if not required_torch_version():
        pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "fp16": {"enabled": True},
        "optimizer": {"type": "Lamb", "params": {"lr": 0.00015}},
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 10

    def mock_unscale_and_clip_grads(norm_groups, apply_scale=True):
        # L2-combine the per-group norms into a single scalar.
        squared_sum = sum((norm**2.0 for norm in norm_groups), 0.0)
        total_norm = math.sqrt(squared_sum)
        local_norm = torch.cuda.FloatTensor([total_norm])
        gathered = [
            torch.zeros_like(local_norm) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(gathered, local_norm)
        # Exactly one distinct value means all ranks agree on the norm.
        distinct_norms = {t.item() for t in gathered}
        assert len(distinct_norms) == 1
        return 1.0

    @distributed_test(world_size=[2])
    def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy):
        # initialize MoE
        groups.initialize_model_parallel(1)
        groups.initialize_expert_parallel(2)
        moe_model = SimpleMoEModel(hidden_dim)
        engine, optimizer, _, _ = deepspeed.initialize(
            args=args,
            model=moe_model,
            model_parameters=moe_model.parameters(),
            dist_init_required=False,
        )
        monkeypatch.setattr(optimizer,
                            'unscale_and_clip_grads',
                            mock_unscale_and_clip_grads)
        optimizer.fused_lamb_legacy = fused_lamb_legacy
        loader = sequence_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_lamb_legacy_optimizer_step(
        args=args,
        hidden_dim=hidden_dim,
        fused_lamb_legacy=fused_lamb_legacy,
    )
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
    """Train in bfloat16 under ZeRO with fewer parameters than ranks.

    The model is asserted to have exactly two parameters while the test runs
    with ``world_size=[3]``. Skipped when the bf16 requirements check fails,
    when CPU offload is requested but cpu-adam is not compatible, or when
    ``zero_stage == 3``.
    """
    if not bf16_required_version_check():
        pytest.skip(
            " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
        )
    if use_cpu_offload:
        if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
            pytest.skip("cpu-adam is not compatible")
    if zero_stage == 3:
        pytest.skip("skip for now")

    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "fp16": {"enabled": False},
        "bfloat16": {"enabled": True},
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload,
            "reduce_bucket_size": 100,
            "allgather_bucket_size": 100,
        },
    }
    args = args_from_dict(tmpdir, ds_config)

    @distributed_test(world_size=[3])
    def _test_zero_empty_partition(args, zero_stage):
        hidden_dim = 1
        net = SimpleModel(hidden_dim)

        # Two parameters across DP=3 is meant to leave one partition empty.
        param_count = len(list(net.parameters()))
        assert param_count == 2

        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=net,
            model_parameters=net.parameters(),
        )

        # Now make sure a training step actually works.
        loader = random_dataloader(
            model=engine,
            total_samples=1,
            hidden_dim=hidden_dim,
            device=engine.device,
            dtype=torch.bfloat16,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_zero_empty_partition(args=args, zero_stage=zero_stage)
def test_zero3_repeat_forward_loop(tmpdir, zero_stage):
    """Train a model that applies one linear layer three times per forward pass.

    ``stage3_param_persistence_threshold`` is set to 0 so that all parameters
    are partitioned. Passes when the training loop completes.
    """
    # force all params to be partitioned by forcing threshold=0
    ds_config = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 2,
        "steps_per_print": 1,
        "zero_optimization": {
            "stage": zero_stage,
            "stage3_param_persistence_threshold": 0,
        },
        "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
        "fp16": {"enabled": True, "initial_scale_power": 8},
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 4

    class AlbertLikeModel(torch.nn.Module):
        """Single linear layer reused in a residual loop, followed by cross-entropy."""

        def __init__(self, hidden_dim):
            super().__init__()
            self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, y):
            # Reuse the same layer several times so that a stack of forwards
            # is followed by a stack of backwards through it.
            hidden = x
            for _ in range(3):
                hidden = hidden + self.linear(hidden)
            return self.cross_entropy_loss(hidden, y)

    model = AlbertLikeModel(hidden_dim=hidden_dim)

    @distributed_test(world_size=[1])
    def _test_zero3_repeat_forward_loop(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=16,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_zero3_repeat_forward_loop(args=args, model=model, hidden_dim=hidden_dim)
def test_checkpoint_unfused_optimizer(tmpdir):
    """Run the checkpoint correctness check for Lamb + fp16 + OneCycle.

    The verification helper is invoked twice: first loading optimizer states,
    then without loading them.
    """
    one_cycle_params = {
        "cycle_first_step_size": 1000,
        "cycle_first_stair_count": 500,
        "cycle_second_step_size": 1000,
        "cycle_second_stair_count": 500,
        "decay_step_size": 1000,
        "cycle_min_lr": 0.0001,
        "cycle_max_lr": 0.0010,
        "decay_lr_rate": 0.001,
        "cycle_min_mom": 0.85,
        "cycle_max_mom": 0.99,
        "decay_mom_rate": 0.0,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {"lr": 0.00015, "max_grad_norm": 1.0},
        },
        "fp16": {"enabled": True},
        "scheduler": {"type": "OneCycle", "params": one_cycle_params},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_unfused_optimizer(args, model, hidden_dim, load_optimizer_states):
        # tmpdir comes from the enclosing test function.
        checkpoint_correctness_verification(
            args,
            model,
            hidden_dim,
            tmpdir,
            load_optimizer_states=load_optimizer_states,
        )

    # With optimizer states first, then without.
    _test_checkpoint_unfused_optimizer(
        args=args,
        model=model,
        hidden_dim=hidden_dim,
        load_optimizer_states=True,
    )
    _test_checkpoint_unfused_optimizer(
        args=args,
        model=model,
        hidden_dim=hidden_dim,
        load_optimizer_states=False,
    )
def test_curriculum_scheduler_fixed_discrete(tmpdir):
    """Check the seqlen reported by the model against a fixed_discrete curriculum.

    Steps 1-8 are compared with the ``ground_truths`` table; any later step
    is expected to report seqlen 5.
    """
    curriculum_section = {
        "enabled": True,
        "curriculum_type": "seqlen",
        "min_difficulty": 1,
        "max_difficulty": 5,
        "schedule_type": "fixed_discrete",
        "schedule_config": {
            "difficulty": [1, 2, 3, 4, 5],
            "max_step": [2, 4, 6, 8],
        },
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {"lr": 0.00015, "weight_decay": 0.01},
        },
        "gradient_clipping": 1.0,
        "fp16": {"enabled": True, "loss_scale": 0, "initial_scale_power": 16},
        "curriculum_learning": curriculum_section,
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 10
    # Expected seqlen keyed by 1-based step number.
    ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4}
    model = Curriculum_SimpleModel(hidden_dim)

    @distributed_test(world_size=[1, 2])
    def _test_curriculum_scheduler_fixed_discrete(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=20,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for n, batch in enumerate(loader):
            loss, seqlen = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()
            # Steps beyond the table fall back to the maximum difficulty.
            true_seqlen = ground_truths.get(n + 1, 5)
            print('at step {} the seqlen is {}'.format(n + 1, seqlen))
            assert seqlen == true_seqlen, f"Incorrect curriculum schedule"

    _test_curriculum_scheduler_fixed_discrete(args=args, model=model, hidden_dim=hidden_dim)
def test_onebitlamb_checkpointing_overflow(tmpdir):
    """Train with OneBitLamb in fp16 and save a checkpoint after every step.

    From iteration 10 onward rank 0 multiplies its loss by 1e6 (per the test
    name, this is meant to provoke overflow). Passes when training and
    checkpoint saving complete without raising.
    """
    onebit_lamb_params = {
        "lr": 0.00015,
        "weight_decay": 0.01,
        "max_coeff": 0.3,
        "min_coeff": 0.01,
        "freeze_step": 2,
        "cuda_aware": False,
        "comm_backend_name": "nccl",
        "coeff_beta": 0.9,
        "factor_max": 1.0,
        "factor_min": 0.5,
        "factor_threshold": 0.1,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "OneBitLamb", "params": onebit_lamb_params},
        "gradient_clipping": 1.0,
        "fp16": {"enabled": True, "loss_scale": 0, "initial_scale_power": 16},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=100,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        save_folder = os.path.join(tmpdir, 'saved_checkpoint')
        for n, batch in enumerate(loader):
            loss = engine(batch[0], batch[1])
            # Blow up the loss on rank 0 only, starting at iteration 10.
            if n >= 10 and dist.get_rank() == 0:
                loss = loss * 1000000.0
            engine.backward(loss)
            dist.barrier()
            engine.step()
            dist.barrier()
            engine.save_checkpoint(save_folder, tag=None)

    _test_onebitlamb_checkpointing_overflow(args=args, model=model, hidden_dim=hidden_dim)
def test_checkpoint_lr_scheduler(tmpdir, zero_stage):
    """Run the checkpoint correctness check for Adam + fp16 + ZeRO + WarmupLR.

    The verification helper is called with optimizer states not loaded and
    LR scheduler states loaded.
    """
    adam_params = {
        "lr": 0.00015,
        "betas": [0.8, 0.999],
        "eps": 1e-8,
        "weight_decay": 3e-7,
    }
    warmup_params = {
        "warmup_min_lr": 0,
        "warmup_max_lr": 0.001,
        "warmup_num_steps": 1000,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": adam_params},
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": zero_stage},
        "scheduler": {"type": "WarmupLR", "params": warmup_params},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[2])
    def _test_checkpoint_lr_scheduler(args,
                                      model,
                                      hidden_dim,
                                      load_optimizer_states,
                                      load_lr_scheduler_states):
        # tmpdir comes from the enclosing test function.
        checkpoint_correctness_verification(
            args,
            model,
            hidden_dim,
            tmpdir,
            load_optimizer_states=load_optimizer_states,
            load_lr_scheduler_states=load_lr_scheduler_states,
        )

    _test_checkpoint_lr_scheduler(
        args=args,
        model=model,
        hidden_dim=hidden_dim,
        load_optimizer_states=False,
        load_lr_scheduler_states=True,
    )
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
    """Check that a static fp16 loss scale is honoured under ZeRO, then train briefly.

    The config sets ``loss_scale`` to 138; after initialization the optimizer
    must report ``dynamic_loss_scale == False`` and a loss scale of 138. The
    worker is run twice, with ``hidden_dim`` 9 and 10, under
    ``world_size=2``. Skipped when CPU offload is requested but cpu-adam is
    not compatible.
    """
    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    config_dict = {
        "train_batch_size": 4,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 138.
        },
        "zero_optimization": {
            "stage": zero_stage,
            "cpu_offload": use_cpu_offload
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=2)
    def _test_zero_static_scale(args, zero_stage, hidden_dim):
        # hidden_dim is chosen by the caller; no local override is needed
        # (the former `hidden_dim = hidden_dim` self-assignment was a no-op).
        model = SimpleModel(hidden_dim)

        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=model.parameters())

        # Ensure the static scaler is configured.
        assert optim.dynamic_loss_scale == False
        assert optim.loss_scaler.loss_scale == 138.

        # Now make sure things work..
        data_loader = random_dataloader(model=model,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=model.device)
        for batch in data_loader:
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

    # test when hidden_dim is not aligned with world size
    # (hidden size not divisible by DP, to cover that scenario)
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9)
    # test when hidden_dim is aligned with world size
    _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10)
def test_zero2_reduce_scatter_off(tmpdir):
    """Train in bfloat16 under ZeRO stage 2 with ``reduce_scatter`` disabled.

    Skipped when the bf16 requirements check fails. Passes when the training
    loop completes under ``world_size=[2]``.
    """
    if not bf16_required_version_check():
        pytest.skip(
            " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
        )

    zero_section = {
        "stage": 2,
        "contiguous_gradients": True,
        "allgather_bucket_size": 2000000000,
        "reduce_bucket_size": 200000000,
        "overlap_comm": False,
        "reduce_scatter": False,
    }
    ds_config = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "gradient_clipping": 1.0,
        "zero_optimization": zero_section,
        "fp16": {"enabled": False},
        "bfloat16": {"enabled": True},
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim)

    @distributed_test(world_size=[2])
    def _helper(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
            dtype=torch.bfloat16,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _helper(args=args, model=model, hidden_dim=hidden_dim)
def test_onebitlamb_fp16_pipeline(topo, tmpdir):
    """Train a pipelined AlexNet with OneBitLamb in fp16 via ``train_cifar``.

    The network is allocated once outside the worker and deep-copied inside
    it so that initial weights are consistent. The worker runs with
    ``world_size=4`` and the default of 500 steps; it passes when training
    completes without raising.
    """
    config_dict = {
        "train_batch_size": 16,
        "train_micro_batch_size_per_gpu": 4,
        "steps_per_print": 20,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00001,
                "betas": [0.9, 0.999],
                "eps": 1e-8,
                "weight_decay": 3e-7,
                "freeze_step": 200,
                "cuda_aware": False,
                "comm_backend_name": "nccl"
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 0
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        },
        "pipeline": {
            "seed_layers": True,
            "activation_checkpoint_interval": 1
        }
    }
    args = args_from_dict(tmpdir, config_dict)

    # Allocate model for consistent initial weights.
    init_net = AlexNetPipe()

    @distributed_test(world_size=4)
    def _helper(topo, tmpdir, steps=500):
        assert steps >= 100

        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        # The returned losses are not inspected, so the result is not bound
        # to a local.
        train_cifar(test_model,
                    args,
                    num_steps=steps,
                    fp16=config_dict['fp16']['enabled'])

    _helper(topo, tmpdir)
def test_three_output_model(tmpdir):
    """Check loss values and backward scaling for a model returning three losses.

    For every batch the model must return three scalar losses, each
    approximately equal to 2.302734375. The value returned by
    ``model.backward`` on the summed loss must approximately equal that sum
    divided by the gradient accumulation steps.
    """
    gradient_accumulation_steps = 3
    micro_batch_size = 1
    world_size = 1
    ds_config = create_config_dict(micro_batch_size,
                                   gradient_accumulation_steps,
                                   world_size)

    hidden_dim = 10
    weight_value = 0.1
    args = args_from_dict(tmpdir, ds_config)
    model = MultiOutputModel(hidden_dim, weight_value)

    @distributed_test(world_size=[1])
    def _test_three_output_model(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        total_samples = gradient_accumulation_steps * micro_batch_size * 2
        loader = multi_output_dataloader(
            model=engine,
            total_samples=total_samples,
            hidden_dim=hidden_dim,
            device=engine.device,
            inputs=[1.0, 2.0, 3.0],
            targets=[1, 2, 3],
        )
        for batch in loader:
            assert len(batch) % 2 == 0, \
                 f"multi_output_dataloader failed to return even number of data samples (input+target)"

            # First half of the batch holds inputs, second half the targets.
            half = len(batch) // 2
            inputs = batch[:half]
            targets = batch[half:]
            losses = engine(inputs, targets)
            assert len(losses) == 3

            expected_loss = torch.tensor(2.302734375,
                                         dtype=torch.half,
                                         device=engine.device)
            for single_loss in losses:
                assert single_loss.shape == torch.Size([])
                assert single_loss.item() == approx(expected_loss.item())

            summed_loss = sum(losses)
            scaled_loss = engine.backward(summed_loss)
            expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps
            assert scaled_loss.item() == approx(expected_scaled_loss.item())

            engine.step()

    _test_three_output_model(args=args, model=model, hidden_dim=hidden_dim)
def test_pld_model(tmpdir, theta):
    """Check the progressive-layer-drop theta reported by the engine after each step.

    After step ``i`` (0-based) the engine's ``get_pld_theta()`` must equal
    ``(1 - theta) * exp(-gamma * i) + theta`` with ``gamma = 0.001``.
    """
    gamma = 0.001
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {"type": 'Adam', "params": {"lr": 0.0001}},
        "fp16": {"enabled": True},
        "progressive_layer_drop": {
            "enabled": True,
            "theta": theta,
            "gamma": gamma,
        },
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = PLD_SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_pld_model(args, model, hidden_dim, theta, gamma):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for step_idx, batch in enumerate(loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

            expected_theta = (1. - theta) * np.exp(-gamma * step_idx) + theta
            actual_theta = engine.get_pld_theta()
            assert expected_theta == actual_theta

    _test_pld_model(
        args=args,
        model=model,
        hidden_dim=hidden_dim,
        theta=theta,
        gamma=gamma,
    )
def test_adam_fp16_onecycle_compatibility(tmpdir):
    """Train with Adam + fp16 and the OneCycle scheduler, with ZeRO turned off.

    The model is built with ``empty_grad=True``. Passes when the training
    loop completes without raising.
    """
    one_cycle_params = {
        "cycle_first_step_size": 16000,
        "cycle_first_stair_count": 8000,
        "decay_step_size": 16000,
        "cycle_min_lr": 1e-06,
        "cycle_max_lr": 3e-05,
        "decay_lr_rate": 1e-07,
        "cycle_min_mom": 0.85,
        "cycle_max_mom": 0.99,
        "decay_mom_rate": 0.0,
    }
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
        "scheduler": {"type": "OneCycle", "params": one_cycle_params},
        "fp16": {"enabled": True},
        "zero_optimization": False,
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=True)

    @distributed_test(world_size=[1])
    def _test_adam_fp16_onecycle_compatibility(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_adam_fp16_onecycle_compatibility(args=args, model=model, hidden_dim=hidden_dim)
def test_flops_profiler_in_ds_trainning(tmpdir):
    """Run four training iterations with the flops profiler enabled and check its counters.

    The profiler is configured with ``start_step=2`` and ``end_step=3``.
    After the loop the engine's profiler must report 100 flops and 110
    params.
    """
    profiler_section = {
        "enabled": True,
        "start_step": 2,
        "end_step": 3,
        "module_depth": -1,
        "top_modules": 3,
    }
    ds_config = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {"type": "Adam", "params": {"lr": 0.001}},
        "zero_optimization": {"stage": 0},
        "fp16": {"enabled": True},
        "flops_profiler": profiler_section,
    }
    hidden_dim = 10
    args = args_from_dict(tmpdir, ds_config)
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=model,
            model_parameters=model.parameters(),
        )
        loader = random_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
            dtype=torch.half,
        )
        for n, batch in enumerate(loader):
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()
            # Stop after the iteration with index 3.
            if n == 3:
                break

        profiler = engine.flops_profiler
        assert profiler.flops == 100
        assert profiler.params == 110

    _test_flops_profiler_in_ds_trainning(args, model, hidden_dim)
def test_moe(tmpdir, ep_size):
    """Train ``SimpleMoEModel`` with expert parallelism of size ``ep_size`` on four ranks.

    After initialization the data-parallel world size must equal the
    distributed world size and the expert-parallel world size must equal
    ``ep_size``. Skipped when ``required_torch_version()`` is falsy.
    """
    if not required_torch_version():
        pytest.skip(
            "DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

    ds_config = {
        "train_batch_size": 8,
        "steps_per_print": 1,
        "fp16": {"enabled": True},
    }
    args = args_from_dict(tmpdir, ds_config)
    hidden_dim = 16

    @distributed_test(world_size=[4])
    def _test_moe(args, hidden_dim, ep_size):
        # E+D -- ep_size = 2
        # E only -- ep_size = 4
        # (replaces the older groups.initialize_model_parallel(1) /
        #  groups.initialize_expert_parallel(2) calls)
        groups.initialize(ep_size=ep_size)

        moe_model = SimpleMoEModel(hidden_dim)
        client_optimizer = torch.optim.AdamW(params=moe_model.parameters())
        # dist_init_required=False -- parameterize to True/False?
        engine, _, _, _ = deepspeed.initialize(
            args=args,
            model=moe_model,
            optimizer=client_optimizer,
            dist_init_required=False,
        )

        dp_world = groups.get_data_parallel_world_size()
        assert dist.get_world_size() == dp_world, "incorrect data parallel world size"
        ep_world = groups.get_expert_parallel_world_size()
        assert ep_size == ep_world, "incorrect expert parallel world size"

        loader = sequence_dataloader(
            model=engine,
            total_samples=50,
            hidden_dim=hidden_dim,
            device=engine.device,
        )
        for batch in loader:
            loss = engine(batch[0], batch[1])
            engine.backward(loss)
            engine.step()

    _test_moe(args=args, hidden_dim=hidden_dim, ep_size=ep_size)
def get_deepspeed_model(self, model, tmpdir):
    """Wrap ``model`` in a DeepSpeed engine using a Lamb config and return it after ``.cuda()``.

    A ``dist.barrier()`` is issued after the config args are built and
    before ``deepspeed.initialize`` is called.
    """
    lamb_config = {
        "train_micro_batch_size_per_gpu": 1,
        "optimizer": {"type": "Lamb", "params": {"lr": 0.00015}},
    }
    ds_args = args_from_dict(tmpdir, lamb_config)

    dist.barrier()

    engine, _, _, _ = deepspeed.initialize(
        args=ds_args,
        model=model,
        model_parameters=model.parameters(),
    )
    return engine.cuda()