def _helper(topo, tmpdir, steps=500):
    assert steps >= 100

    # NOTE: init_net, args, and config_dict are captured from the enclosing
    # test scope.
    base_net = copy.deepcopy(init_net)
    base_model = PipelineModule(layers=base_net.to_layers(),
                                num_stages=1,
                                loss_fn=nn.CrossEntropyLoss())
    # Train with just data parallelism
    base_losses = train_cifar(base_model,
                              args,
                              num_steps=steps,
                              fp16=config_dict['fp16']['enabled'])

    test_net = copy.deepcopy(init_net)
    test_model = PipelineModule(layers=test_net.to_layers(),
                                topology=topo,
                                loss_fn=nn.CrossEntropyLoss())
    test_losses = train_cifar(test_model,
                              args,
                              num_steps=steps,
                              fp16=config_dict['fp16']['enabled'])

    abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)]
    rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)]
    if dist.get_rank() == 0:
        print(f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}')
        print(f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}')
        print(f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}')

        for lastX in [1, 10, 100]:
            base_avg = sum(base_losses[-lastX:]) / lastX
            test_avg = sum(test_losses[-lastX:]) / lastX
            print(f'last-{lastX}: base={base_avg} test={test_avg} '
                  f'abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}')

    # The pipeline-parallel run should match the data-parallel baseline to
    # within 3% relative difference, averaged over the final 100 steps.
    lastX = 100
    base = base_losses[-lastX:]
    base_avg = sum(base) / len(base)
    test = test_losses[-lastX:]
    test_avg = sum(test) / len(test)
    assert rel_diff(base_avg, test_avg) < 0.03
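# rel_diff() is referenced above but not defined in this snippet. Below is a
# minimal sketch of a relative-difference helper consistent with how it is
# used here -- an assumption, not necessarily the test suite's exact
# definition.
def rel_diff(A, B):
    """Relative difference of B from A, e.g. rel_diff(2.0, 2.1) == 0.05."""
    return abs(A - B) / abs(A)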
def _helper(topo, tmpdir, steps=500):
    assert steps >= 100

    # Trimmed variant of _helper: train only the pipeline-parallel model,
    # with no data-parallel baseline to compare against.
    test_net = copy.deepcopy(init_net)
    test_model = PipelineModule(layers=test_net.to_layers(),
                                topology=topo,
                                loss_fn=nn.CrossEntropyLoss())
    test_losses = train_cifar(test_model,
                              args,
                              num_steps=steps,
                              fp16=config_dict['fp16']['enabled'])
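# Hypothetical usage sketch: _helper is typically invoked from a test body
# with a topology built from PipeTopo. The topology values below are
# assumptions for illustration only.
#
#   topo = PipeTopo(num_pp=2, num_dp=2)
#   _helper(topo, tmpdir, steps=500)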
def test(self, topo_config): config_dict = { "train_batch_size": 16, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 20, "optimizer": { "type": "ZeroOneAdam", "params": { "lr": 0.00001, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "var_freeze_step": 4, "var_update_scaler": 1, "local_step_scaler": 1, "local_step_clipper": 2, "cuda_aware": False, "comm_backend_name": "nccl", }, }, "gradient_clipping": 1.0, "zero_optimization": { "stage": 0 }, "fp16": { "enabled": True, "loss_scale": 0, "initial_scale_power": 16 }, "pipeline": { "seed_layers": True, "activation_checkpoint_interval": 1 }, } topo = PipeTopo(**topo_config) steps = 500 # Must be >=100 # Allocate model for consistent initial weights. init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, config=config_dict, num_steps=steps, fp16=config_dict["fp16"]["enabled"], )
def test(self, topo_config): config_dict = { "train_batch_size": 16, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 20, "optimizer": { "type": "Adam", "params": { "lr": 0.001, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } }, "zero_optimization": { "stage": 0 }, "fp16": { "enabled": False }, "pipeline": { "seed_layers": True, "activation_checkpoint_interval": 1 } } topo = PipeTopo(**topo_config) steps = 500 # must be >=100 # Allocate model for consistent initial weights. init_net = AlexNetPipe() base_net = copy.deepcopy(init_net) base_model = PipelineModule(layers=base_net.to_layers(), num_stages=1, loss_fn=nn.CrossEntropyLoss()) # Train with just data parallelism base_losses = train_cifar(base_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) test_net = copy.deepcopy(init_net) test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] rel_diffs = [ rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses) ] if dist.get_rank() == 0: print( f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' ) print( f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' ) print( f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' ) for lastX in [1, 10, 100]: base_avg = sum(base_losses[-lastX:]) / lastX test_avg = sum(test_losses[-lastX:]) / lastX print( f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}' ) lastX = 100 base = base_losses[-lastX:] base_avg = sum(base) / len(base) test = test_losses[-lastX:] test_avg = sum(test) / len(test) assert rel_diff( base_avg, test_avg ) < 0.05 # Originally 0.03, but seeing instability with AMD results