def none_skip(pipeline_style):
    if pipeline_style == MultiProcessPipe.AsyncSchedule:
        pytest.skip("Skip tensors NYI for AsyncSchedule")

    @skippable(stash=["none"])
    class Stash(nn.Module):
        def forward(self, input):
            yield stash("none", None)
            return input

    @skippable(pop=["none"])
    class Pop(nn.Module):
        def forward(self, input):
            none = yield pop("none")
            assert none is None
            return input

    model = nn.Sequential(Stash(), Pop())
    model = MultiProcessPipe(
        model,
        [1, 1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=5,
    ).cuda()

    input = torch.rand(10, requires_grad=True).cuda()
    input.retain_grad()
    output = model(input)

    def assert_grad_fn_is_not_portal(grad_fn, visited=None):
        # Use None instead of a mutable default so the visited set stays
        # local to each top-level call.
        if visited is None:
            visited = set()
        if grad_fn in visited or grad_fn is None:
            return

        assert not isinstance(grad_fn, PortalBlue._backward_cls)
        assert not isinstance(grad_fn, PortalCopy._backward_cls)
        assert not isinstance(grad_fn, PortalOrange._backward_cls)

        visited.add(grad_fn)
        for next_grad_fn, _ in grad_fn.next_functions:
            assert_grad_fn_is_not_portal(next_grad_fn, visited)

    if model.group.rank() == 1:
        assert_grad_fn_is_not_portal(output.grad_fn)
        output.sum().backward()
    else:
        model.back_helper(output)
        # Only rank 0 fed `input` into its partition, so only rank 0 has a
        # gradient on it.
        assert input.grad.mean().item() == 1


def tuple_wait(cuda_sleep, pipeline_style):
    # In v0.0.3, Wait is applied to only the first tensor on a micro-batch.
    # Under this behavior, if checkpointing was disabled, there's a
    # possibility that gradient accumulations on other tensors are not
    # synchronized properly to the copy stream.
    class Sleep(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.detach()

        @staticmethod
        def backward(ctx, grad):
            with torch.cuda.device(grad.device):
                cuda_sleep(0.05)
            return grad

    class Layer1(nn.Module):
        def forward(self, pair):
            a, b = pair
            return a * 1, b * 2, b * 3

    class Layer2(nn.Module):
        def forward(self, triple):
            a, b, c = triple
            b = Sleep.apply(b)
            return a + b + c

    model = nn.Sequential(Layer1(), Layer2())
    model = MultiProcessPipe(
        model,
        [1, 1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        input_device=torch.cuda.current_device(),
        chunks=32,
        checkpoint="never",
    ).cuda()

    a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
    b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)

    y = model((a, b))
    if model.group.rank() == 1:
        y.norm().backward()
    else:
        model.back_helper(y)

    if model.group.rank() == 0:
        assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))


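# A minimal sketch (not fairscale's implementation) of the copy-stream hazard
# the test above guards against: work queued on one CUDA stream must be
# synchronized before another stream consumes its result. The helper name
# below is illustrative only.
def _copy_stream_sync_sketch():
    if not torch.cuda.is_available():
        return None
    default = torch.cuda.current_stream()
    copy_stream = torch.cuda.Stream()
    x = torch.rand(1024, device="cuda")
    with torch.cuda.stream(copy_stream):
        y = x * 2  # queued on the copy stream
    # Without this wait, the default stream may read `y` before it is ready.
    default.wait_stream(copy_stream)
    # Tell the caching allocator `y` is also used on the default stream so
    # its memory is not reused too early.
    y.record_stream(default)
    return y + 1  # ordered after the copy-stream work

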
def partitions(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    assert isinstance(model.partitions, list)
    assert len(model) == 1
    assert isinstance(model.partitions[0].module, nn.Sequential)

    if model.group.rank() == 0:
        assert "0.0.weight" in model.state_dict()
    else:
        assert "0.1.weight" in model.state_dict()


def async_event_loop():
    model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU())
    pipe = MultiProcessPipe(
        model,
        [1, 1, 1, 1],
        style=MultiProcessPipe.AsyncSchedule,
        worker_map=get_worker_map(),
        chunks=10,
    )

    inputs = torch.rand(100, 10)
    output = pipe(inputs)
    if pipe.final_stage:
        loss = output.mean()
        loss.backward()


def checkpoint_mode_invalid(pipeline_style):
    model = nn.Sequential(nn.Linear(1, 1))

    with pytest.raises(ValueError, match="checkpoint is not one of 'always', 'except_last', or 'never'"):
        MultiProcessPipe(
            model,
            balance=[1],
            style=pipeline_style,
            worker_map=get_worker_map(),
            chunks=2,
            checkpoint="INVALID_CHECKPOINT",
        )


def checkpoint_mode_when_chunks_1(pipeline_style):
    model = nn.Sequential(nn.Linear(1, 1))

    # All checkpoint modes are fine.
    MultiProcessPipe(
        model,
        balance=[1],
        style=pipeline_style,
        worker_map=get_worker_map(),
        chunks=1,
        checkpoint="except_last",
    )
    MultiProcessPipe(
        model, balance=[1], style=pipeline_style, worker_map=get_worker_map(), chunks=1, checkpoint="always"
    )
    MultiProcessPipe(
        model, balance=[1], style=pipeline_style, worker_map=get_worker_map(), chunks=1, checkpoint="never"
    )


def async_event_loop_interleave_hard():
    model = nn.Sequential(
        nn.Linear(10, 10), nn.Linear(10, 10), nn.Linear(10, 10), nn.Linear(10, 10)
    )
    pipe = AMPnetPipe(
        module=model,
        balance=[1, 1, 1, 1],
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="never",
    )
    fake_dataset = FakeDataset()
    fake_dataloader = DataLoader(fake_dataset, batch_size=4, shuffle=True, num_workers=0)
    loss = nn.MSELoss()
    opt = MySGD(model.parameters(), lr=0.01)
    transform_and_log = AMPnetDelegate()
    pipe.interleave(fake_dataloader, loss, opt, transform_and_log)


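# The interleave tests rely on FakeDataset and MySGD helpers defined elsewhere
# in this suite. The sketch below shows plausible minimal versions; the
# _Sketch* names, sizes, and the plain-SGD subclass are assumptions, not the
# suite's actual definitions.
from torch.utils.data import Dataset


class _SketchFakeDataset(Dataset):
    # 16 random (input, target) pairs shaped for the nn.Linear(10, 10) stages.
    def __len__(self):
        return 16

    def __getitem__(self, index):
        return torch.rand(10), torch.rand(10)


class _SketchSGD(torch.optim.SGD):
    # Plain SGD; a subclass only so tests can hook or log optimizer steps.
    pass

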
def checkpoint_mode(pipe_class):
    def count_grad_fn(grad_fn, name, visited=None):
        # `visited` must not be a mutable default: the three pipes below wrap
        # the same underlying module, so their autograd graphs share
        # AccumulateGrad nodes, and a set persisting across calls would prune
        # the later traversals.
        if visited is None:
            visited = set()
        if grad_fn in visited:
            return 0
        visited.add(grad_fn)

        if grad_fn is None:
            return 0
        if grad_fn.__class__.__name__ == name:
            return 1

        counter = 0
        for next_grad_fn, _ in grad_fn.next_functions:
            counter += count_grad_fn(next_grad_fn, name, visited=visited)
        return counter

    model = nn.Sequential(nn.Linear(1, 1))
    input = torch.rand(2, 1)

    always = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        checkpoint="always",
        pipelined_backward=False,
    )
    except_last = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        checkpoint="except_last",
        pipelined_backward=False,
    )
    never = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        checkpoint="never",
        pipelined_backward=False,
    )

    always_output = always(input)
    except_last_output = except_last(input)
    never_output = never(input)

    assert count_grad_fn(always_output.grad_fn, "CheckpointBackward") == 2
    assert count_grad_fn(except_last_output.grad_fn, "CheckpointBackward") == 1
    assert count_grad_fn(never_output.grad_fn, "CheckpointBackward") == 0


def simple_linears(pipe_class):
    def sum_grad(parameters):
        return sum([p.grad.sum() for p in parameters if p.grad is not None])

    def zero_grad(parameters):
        for p in parameters:
            p.grad = None

    set_random_seed(12345)
    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without MultiProcessPipe
    outputs = model(inputs)
    loss = outputs.mean()
    loss.backward()

    grad_without_pipe = [
        sum_grad([*model[0].parameters(), *model[1].parameters()]),
        sum_grad([*model[2].parameters(), *model[3].parameters()]),
    ]

    zero_grad(model.parameters())

    # With MultiProcessPipe
    model = pipe_class(model, [2, 2], worker_map=get_worker_map(), chunks=4)

    outputs = model(inputs)
    if model.group.rank() == 1:
        loss = outputs.mean()
        loss.backward()
        grad_with_pipe = sum_grad(model.partition.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[1])
    else:
        model.back_helper(outputs)
        grad_with_pipe = sum_grad(model.partition.parameters())

        # Both grads should be identical.
        assert torch.allclose(grad_with_pipe, grad_without_pipe[0])

    torch.distributed.barrier()


def verify_module_duplicate_parameters_on_distinct_partitions(pipe_class):
    class Surrogate(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

    conv = nn.Conv2d(3, 3, 1)
    model = nn.Sequential(Surrogate(conv), Surrogate(conv))

    # FIXME(tom) can't have duplicate params with separate processes
    with pytest.raises(
        ValueError, match="module with duplicate parameters on distinct devices is not supported"
    ):
        pipe_class(model, [1, 1], worker_map=get_worker_map())


def exception(pipe_class):
    class ExpectedException(Exception):
        pass

    class Raise(nn.Module):
        def forward(self, *_):
            raise ExpectedException()

    model = nn.Sequential(Raise())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=1)

    with pytest.raises(ExpectedException):
        model(torch.rand(1))


def non_tensor(pipe_class):
    class NonTensor(nn.Module):
        def forward(self, _):
            return "hello"

    model = nn.Sequential(NonTensor())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map())
    x = torch.rand(1)

    # TypeError: expected Tensor as element 0 in argument 0, but got str
    with pytest.raises(TypeError):
        model(x)

    # TypeError: expected Tensor to scatter, but got str
    with pytest.raises(TypeError):
        model("hello")


def public_attrs(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1))
    pipe = pipe_class(
        model,
        balance=(1,),
        worker_map=get_worker_map(),
        chunks=42,
        checkpoint="always",
    )

    assert pipe.balance == [1]
    assert pipe.chunks == 42
    assert isinstance(pipe.chunks, int)
    assert pipe.checkpoint == "always"
    assert isinstance(pipe.checkpoint, str)


def non_tensor_tuple(pipe_class):
    class NonTensorTuple(nn.Module):
        def forward(self, x):
            return (x, "hello")

    model = nn.Sequential(NonTensorTuple())
    model = pipe_class(model, balance=[1], worker_map=get_worker_map())
    x = torch.rand(1)

    # TypeError: CheckpointBackward.forward: expected Variable (got str) for return value 1
    with pytest.raises(TypeError):
        model(x)

    # TypeError: expected Tensor to scatter, but got str
    with pytest.raises(TypeError):
        model((x, "hello"))


def public_attrs_coercion(pipe_class):
    # Like public_attrs above, but verifies that a float `chunks` and a
    # str-convertible `checkpoint` are coerced to canonical int and str.
    class MyString:
        def __init__(self, value):
            self.value = value

        def __str__(self):
            return self.value

    model = nn.Sequential(nn.Linear(1, 1))
    pipe = pipe_class(
        model,
        balance=(1,),
        worker_map=get_worker_map(),
        chunks=42.000,
        checkpoint=MyString("always"),
    )

    assert pipe.balance == [1]
    assert pipe.chunks == 42
    assert isinstance(pipe.chunks, int)
    assert pipe.checkpoint == "always"
    assert isinstance(pipe.checkpoint, str)


def named_children(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(OrderedDict([("a", a), ("b", b)]))
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    names = set(n for n, _ in model.named_modules())
    if model.group.rank() == 0:
        assert "0.a" in names
    else:
        assert "0.b" in names

    # MultiProcessPipe doesn't support __getattr__. Unlike nn.Sequential,
    # MultiProcessPipe requires several methods in its namespace.
    with pytest.raises(AttributeError):
        model.a


def inplace_on_not_requires_grad(pipe_class):
    # In-place operation on a tensor not requiring grad doesn't cause a
    # RuntimeError. Currently, we cannot detect this case.
    model = nn.Sequential(nn.ReLU(inplace=True))
    model = pipe_class(model, [1], worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)
    y = model(x)
    del model

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    with pytest.raises(RuntimeError, match=message):
        y.backward()

    torch.distributed.barrier()


def check_pipe_against_reference(balance, model_constructor, checkpoint="except_last", custom_inputs=None):
    model = model_constructor()
    reference_model = model_constructor()
    for src, dst in zip(model, reference_model):
        dst.load_state_dict(copy.deepcopy(src.state_dict()))

    reference_model = nn.Sequential(*reference_model).cuda()

    pipe = PipeRPCWrapper(
        model,
        balance,
        input_device=torch.cuda.current_device(),
        worker_map=get_worker_map(),
        checkpoint=checkpoint,
    )

    pipe.foreach_worker(register_optimizer, include_self=True)
    register_optimizer(None, reference_model)

    inputs = torch.rand(10).cuda()
    target = torch.rand(10).cuda()

    output = pipe(inputs)
    ref_out = reference_model(inputs)
    assert torch.equal(ref_out.cpu(), output.cpu())

    for out in output, ref_out:
        target = target.to(out.device)
        loss = nn.MSELoss()(out, target)
        loss.backward()

    pipe.foreach_worker(step_optimizer, include_self=True)
    step_optimizer(None, reference_model.cuda())

    pipe.eval()
    reference_model.eval()

    final_output = pipe(inputs)
    final_ref = reference_model(inputs.cuda())

    assert torch.equal(final_output.cpu(), final_ref.cpu())


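# check_pipe_against_reference assumes register_optimizer and step_optimizer
# callbacks defined elsewhere in this suite; foreach_worker invokes them on
# every pipeline stage. A plausible minimal sketch follows; the _sketch_*
# names and the attach-to-module convention are assumptions, not the suite's
# actual code.
def _sketch_register_optimizer(ctx, model):
    # Attach an optimizer to the stage's local module so a later callback can
    # find it again.
    model.optimizer = torch.optim.SGD(model.parameters(), lr=0.05)


def _sketch_step_optimizer(ctx, model):
    model.optimizer.step()
    model.optimizer.zero_grad()

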
def async_event_loop_interleave_simple():
    model = nn.Sequential(
        nn.Linear(10, 10), nn.ReLU(inplace=False), nn.Linear(10, 10), nn.ReLU(inplace=False)
    )
    pipe = AMPnetPipe(
        module=model,
        balance=[2, 2],
        worker_map=get_worker_map(),
        chunks=10,
        checkpoint="never",
    )
    fake_dataset = FakeDataset()
    fake_dataloader = DataLoader(fake_dataset, batch_size=4, shuffle=True, num_workers=0)
    loss = nn.MSELoss()
    opt = MySGD(model.parameters(), lr=0.01)
    transform_and_log = AMPnetDelegate()
    pipe.interleave(fake_dataloader, loss, opt, transform_and_log)


def deferred_batch_norm(checkpoint, lazy, pipe_class):
    bn = nn.BatchNorm2d(3)
    pipe_bn = deepcopy(bn)
    pipe_fn = lambda: pipe_bn  # noqa: E731
    if lazy:
        model = [LazyModule(pipe_fn)]
    else:
        model = nn.Sequential(pipe_bn)
    pipe = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        checkpoint=checkpoint,
        deferred_batch_norm=True,
    )

    x = torch.rand(4, 3, 10, 10)
    pipe(x).mean().backward()
    bn(x).mean().backward()

    assert torch.allclose(pipe[0].running_mean, bn.running_mean, atol=1e-4)
    assert torch.allclose(pipe[0].running_var, bn.running_var, atol=1e-4)


def rpc_reuse_in_final_stage():
    # 'reused' and 'reused2' are located on stage 2, so the backward pass for
    # the final stage will need to first send gradients to stage 2, then
    # receive gradients from stage 2. This tests custom logic to handle reuse
    # of layers in the final stage of the pipeline.
    reused = nn.Linear(10, 10)
    reused2 = nn.Linear(10, 10)
    model = [
        nn.Linear(10, 10),
        nn.ReLU(),
        nn.Linear(10, 10),
        reused2,
        nn.ReLU(),
        reused,
        nn.ReLU(),
        reused,
        reused2,
        nn.ReLU(),
        reused,
        nn.ReLU(),
    ]
    balance = [2, 3, 4]

    init_rpc()

    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    pipe = PipeRPCWrapper(model, balance, worker_map=get_worker_map())

    inputs = torch.rand(10).cuda()
    target = torch.rand(10).cuda()
    output = pipe(inputs)
    nn.MSELoss()(output, target).backward()
    output = pipe(inputs)
    nn.MSELoss()(output, target).backward()

    rpc.shutdown()
    torch.distributed.barrier()


def deny_moving(pipe_class):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = pipe_class(model, [1, 1], worker_map=get_worker_map())

    # Moving is denied.
    with pytest.raises(TypeError):
        model.cuda()
    with pytest.raises(TypeError):
        model.cpu()
    with pytest.raises(TypeError):
        model.to(torch.device("cuda"))
    with pytest.raises(TypeError):
        model.to(0)
    with pytest.raises(TypeError):
        model.to("cuda")
    with pytest.raises(TypeError):
        model.to(device=0)
    with pytest.raises(TypeError):
        model.to(torch.rand(1))
    with pytest.raises(TypeError):
        model.to(tensor=torch.rand(1))

    # Casting is allowed.
    model.half()
    model.to(torch.double)
    model.to(dtype=torch.float)


def inplace_on_requires_grad(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
    model = pipe_class(model, [1, 1], worker_map=get_worker_map(), checkpoint="always")

    x = torch.rand(1)

    if pipe_class == AsyncPipe and model.group.rank() == 0:
        # With AsyncPipe, model will wait forever for gradients if not eval
        model.eval()
    y = model(x)

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    if model.group.rank() == 1:
        with pytest.raises(RuntimeError, match=message):
            y.backward()

    torch.distributed.barrier()


def no_grad(pipe_class):
    model = nn.Sequential(nn.Linear(1, 1))
    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2)
    input = torch.rand(2, 1)

    latent = None

    def hook(module, input, output):
        _ = module
        _ = input

        nonlocal latent
        latent = output

    partition = model.partitions[0]
    partition.module.register_forward_hook(hook)

    with torch.no_grad():
        model(input)

    assert latent.grad_fn is None


def input_singleton(pipe_class):
    class One(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, only_a):
            (a,) = only_a
            return (self.fc(a),)

    model = nn.Sequential(One())
    model = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        pipelined_backward=False,
    )

    a = torch.rand(10, 1, requires_grad=True)

    (a_out,) = model((a,))
    loss = a_out.mean()
    loss.backward()

    assert all(p.grad is not None for p in model.parameters())
    assert a.grad is not None


def deferred_batch_norm_params(checkpoint, lazy, pipe_class):
    bn = nn.BatchNorm2d(3)
    pipe_bn = deepcopy(bn)
    pipe_fn = lambda: pipe_bn  # noqa: E731
    if lazy:
        model = [LazyModule(pipe_fn)]
    else:
        model = nn.Sequential(pipe_bn)
    pipe = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=1,
        checkpoint=checkpoint,
        deferred_batch_norm=True,
    )

    x = torch.rand(4, 3, 10, 10)
    pipe(x).mean().backward()
    bn(x).mean().backward()

    assert pipe[0].weight.grad is not None
    assert pipe[0].bias.grad is not None

    assert torch.allclose(pipe[0].weight.grad, bn.weight.grad, atol=1e-4)
    assert torch.allclose(pipe[0].bias.grad, bn.bias.grad, atol=1e-4)


def python_autograd_function(pipe_class):
    # FIXME deadlock with AsyncPipe?
    # A Python autograd function might fail with this error:
    #
    #   RuntimeError: Returning Variables sharing storage with other Variables
    #   that require grad is not supported in Python functions. Please submit a
    #   feature request if you hit this error.
    #
    # It doesn't look like an essential restriction. But it happens on the
    # current PyTorch version. To avoid it, we should detach the tensor before
    # returning by identity autograd functions, such as Wait, Fork, and Join.
    torch.manual_seed(0)

    class Identity(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            return input

        @staticmethod
        def backward(ctx, grad):
            return grad

    class M(nn.Module):
        def forward(self, input):
            return Identity.apply(input)

    model = nn.Sequential(M(), M())
    model = pipe_class(model, [1, 1], worker_map=get_worker_map(), checkpoint="always").cuda()
    model.eval()

    x = torch.rand(42)
    y = model(x)
    if model.group.rank() == 1:
        assert torch.allclose(x, y)

    torch.distributed.rpc.shutdown()
    torch.distributed.barrier()


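# A minimal sketch of the workaround the comment above describes: an identity
# autograd function that detaches before returning, so its output no longer
# shares storage with a grad-requiring input. (Assumption: this mirrors what
# identity functions like Wait, Fork, and Join do; the _SketchDetachingIdentity
# name is illustrative only.)
class _SketchDetachingIdentity(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # detach() returns a tensor outside the autograd graph, sidestepping
        # the "Returning Variables sharing storage" RuntimeError.
        return input.detach()

    @staticmethod
    def backward(ctx, grad):
        return grad

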
def sequential_like(balance, pipeline_style):
    a = nn.Linear(1, 1)
    b = nn.Linear(1, 1)

    model = nn.Sequential(a, b)
    model = MultiProcessPipe(model, balance, style=pipeline_style, worker_map=get_worker_map())

    if balance == [2]:
        if torch.distributed.get_rank() == 0:
            assert len(model) == 2
            assert list(model) == [a, b]

            assert model[0] is a
            assert model[1] is b
            with pytest.raises(IndexError):
                _ = model[2]

            assert model[-1] is b
            assert model[-2] is a
        else:
            assert len(model) == 0
            assert list(model) == []
    else:
        assert len(model) == 1
        if torch.distributed.get_rank() == 0:
            assert list(model) == [a]
            assert model[0] is a
            assert model[-1] is a
        else:
            assert list(model) == [b]
            assert model[0] is b
            assert model[-1] is b

        with pytest.raises(IndexError):
            _ = model[1]


def input_pair(pipe_class):
    class Two(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc_a = nn.Linear(1, 1)
            self.fc_b = nn.Linear(1, 1)

        def forward(self, a_and_b):
            a, b = a_and_b
            return (self.fc_a(a), self.fc_b(b))

    model = nn.Sequential(Two())
    model = pipe_class(
        model,
        balance=[1],
        worker_map=get_worker_map(),
        chunks=2,
        pipelined_backward=False,
    )

    a = torch.rand(10, 1, requires_grad=True)
    b = torch.rand(10, 1, requires_grad=True)

    a_out, b_out = model((a, b))
    loss = (a_out + b_out).mean()
    loss.backward()

    assert a.grad is not None
    assert b.grad is not None


def checkpoint_non_float_input(pipe_class):
    class ForkNonFloat(nn.Module):
        def forward(self, input):
            return (input * 2, torch.tensor([False]))

    class JoinNonFloat(nn.Module):
        def forward(self, input):
            return input[0] * 2

    model = nn.Sequential(ForkNonFloat(), JoinNonFloat())
    model = pipe_class(
        model,
        balance=[1, 1],
        worker_map=get_worker_map(),
        chunks=1,
        checkpoint="always",
        pipelined_backward=False,
    )

    input = torch.rand(1, requires_grad=True)
    output = model(input)
    if model.group.rank() == 1:
        # with torch.autograd.detect_anomaly():
        output.backward()
    elif pipe_class == MultiProcessPipe:
        model.back_helper(output)

    torch.distributed.barrier()