def test_checkpoint_mode_when_chunks_1():
    """Build a single-chunk Pipe once per checkpoint mode.

    The test passes as long as none of the constructions raises.
    """
    layers = nn.Sequential(nn.Linear(1, 1))

    # All checkpoint modes are fine.
    for mode in ("except_last", "always", "never"):
        Pipe(layers, balance=[1], devices=["cpu"], chunks=1, checkpoint=mode)
def test_simple_linears():
    """Compare the summed gradients of a linear stack with and without Pipe."""

    def sum_grad(parameters):
        # Parameters that never received a gradient are skipped.
        return sum(p.grad.sum() for p in parameters if p.grad is not None)

    def zero_grad(parameters):
        for param in parameters:
            param.grad = None

    batch = torch.rand(8, 1)
    sequential = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Without Pipe
    sequential(batch).mean().backward()
    grad_without_pipe = sum_grad(sequential.parameters())

    # Drop the gradients so the second pass starts from a clean slate.
    zero_grad(sequential.parameters())

    # With Pipe
    piped = Pipe(sequential, [2, 2], devices=["cpu", "cpu"], chunks=4)
    piped(batch).mean().backward()
    grad_with_pipe = sum_grad(piped.parameters())

    # Both grads should be identical.
    assert torch.allclose(grad_with_pipe, grad_without_pipe)
def test_input_singleton(setup_rpc):
    """Feed a one-element tuple, then a one-element list, through Pipe.

    After each backward pass every parameter and the input itself must hold
    a gradient.
    """

    class One(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, only_a):
            (a,) = only_a
            return (self.fc(a),)

    model = Pipe(nn.Sequential(One()), chunks=2)
    a = torch.rand(10, 1, requires_grad=True)

    def forward_backward(wrapped):
        # The forward result is unwrapped through local_value() before use.
        (a_out,) = model(wrapped).local_value()
        a_out.mean().backward()

        assert all(p.grad is not None for p in model.parameters())
        assert a.grad is not None

    forward_backward((a,))

    # Test with list
    a.grad = None
    for param in model.parameters():
        param.grad = None

    forward_backward([a])
def test_checkpoint_mode():
    """Count CheckpointBackward nodes produced under each checkpoint mode.

    With two chunks the expected counts are 2 for "always", 1 for
    "except_last" and 0 for "never".
    """

    def count_grad_fn(grad_fn, name, visited=None):
        # Walk the autograd graph from grad_fn and count nodes whose class
        # name equals `name`. A matching node is counted once and not
        # expanded any further.
        seen = set() if visited is None else visited
        pending = [grad_fn]
        total = 0

        while pending:
            node = pending.pop()
            if node in seen:
                continue
            seen.add(node)

            if node is None:
                continue

            if node.__class__.__name__ == name:
                total += 1
                continue

            pending.extend(child for child, _ in node.next_functions)

        return total

    model = nn.Sequential(nn.Linear(1, 1))
    sample = torch.rand(2, 1)

    expected = {"always": 2, "except_last": 1, "never": 0}

    # Build every pipe first, then run every forward, then check the counts.
    pipes = {
        mode: Pipe(model, balance=[1], devices=["cpu"], chunks=2, checkpoint=mode)
        for mode in expected
    }
    outputs = {mode: pipe(sample) for mode, pipe in pipes.items()}

    for mode, count in expected.items():
        assert count_grad_fn(outputs[mode].grad_fn, "CheckpointBackward") == count
def test_chunks_less_than_1():
    """Pipe must raise ValueError when chunks is zero or negative."""
    model = nn.Sequential(nn.Linear(1, 1))

    for bad_chunks in (0, -1):
        with pytest.raises(ValueError):
            Pipe(model, chunks=bad_chunks)
def test_chunks_less_than_1():
    """Pipe with explicit balance/devices must reject chunks below one."""
    model = nn.Sequential(nn.Linear(1, 1))

    for bad_chunks in (0, -1):
        with pytest.raises(ValueError):
            Pipe(model, balance=[1], devices=["cpu"], chunks=bad_chunks)
def test_balance_less_than_1():
    """A balance containing a zero or negative entry must raise ValueError."""
    model = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))

    for bad_balance in ([0, 2], [-1, 3]):
        with pytest.raises(ValueError):
            Pipe(model, balance=bad_balance)
def test_balance_wrong_length():
    """A balance whose sum differs from the layer count must raise ValueError."""
    model = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))

    # The model has two layers; these balances sum to 1 and 3.
    for bad_balance in ([1], [3]):
        with pytest.raises(ValueError):
            Pipe(model, balance=bad_balance)
def test_recommend_auto_balance():
    """Balance-related ValueErrors must mention the auto-balance module."""
    hint = "torch.distributed._pipeline.sync.balance"

    # balance is required
    with pytest.raises(ValueError, match=hint):
        Pipe(nn.Sequential())

    # module and sum of balance have different length
    # (module: 0, sum of balance: 1)
    with pytest.raises(ValueError, match=hint):
        Pipe(nn.Sequential(), [1])

    # module and sum of balance have different length
    # (module: 2, sum of balance: 1)
    two_layers = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))
    with pytest.raises(ValueError, match=hint):
        Pipe(two_layers, [1])
def test_partitions():
    """Pipe exposes its partitions as a ModuleList of Sequential modules."""
    layers = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))
    pipe = Pipe(layers, [1, 1], devices=["cpu", "cpu"])

    assert isinstance(pipe.partitions, nn.ModuleList)
    for index in (0, 1):
        assert isinstance(pipe.partitions[index], nn.Sequential)

    # Parameter names are prefixed with the partition they live in.
    state = pipe.state_dict()
    assert "partitions.0.0.weight" in state
def test_named_children():
    """Named layers keep their names under the partition they are placed in."""
    named_layers = OrderedDict([("a", nn.Linear(1, 1)), ("b", nn.Linear(1, 1))])
    pipe = Pipe(nn.Sequential(named_layers), [1, 1], devices=["cpu", "cpu"])

    module_names = {name for name, _ in pipe.named_modules()}
    assert "partitions.0.a" in module_names
    assert "partitions.1.b" in module_names

    # Pipe doesn't support __getattr__. Unlike nn.Sequential, Pipe requires
    # several methods in its namespace.
    with pytest.raises(AttributeError):
        getattr(pipe, "a")
def test_input_pair(setup_rpc):
    """Feed a pair of tensors through Pipe, as a tuple and then as a list.

    Both inputs must receive a gradient after each backward pass.
    """

    class Two(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc_a = nn.Linear(1, 1)
            self.fc_b = nn.Linear(1, 1)

        def forward(self, a_and_b):
            a, b = a_and_b
            return (self.fc_a(a), self.fc_b(b))

    model = Pipe(nn.Sequential(Two()), chunks=2)

    a = torch.rand(10, 1, requires_grad=True)
    b = torch.rand(10, 1, requires_grad=True)

    def forward_backward(pair):
        # The forward result is unwrapped through local_value() before use.
        a_out, b_out = model(pair).local_value()
        (a_out + b_out).mean().backward()

        assert a.grad is not None
        assert b.grad is not None

    forward_backward((a, b))

    # Test with list.
    a.grad = None
    b.grad = None

    forward_backward([a, b])
def test_python_autograd_function():
    """Run identity Python autograd functions through a checkpointed Pipe."""
    # A Python autograd function might fail with this error:
    #
    #   RuntimeError: Returning Variables sharing storage with other Variables
    #   that require grad is not supported in Python functions. Please submit a
    #   feature request if you hit this error.
    #
    # It doesn't look like an essential restriction. But it happens on the
    # current PyTorch version. To avoid it, we should detach the tensor before
    # returning by identity autograd functions, such as Wait, Fork, and Join.
    #
    class Identity(torch.autograd.Function):
        @staticmethod
        def forward(ctx, tensor):
            return tensor

        @staticmethod
        def backward(ctx, grad_output):
            return grad_output

    class M(nn.Module):
        def forward(self, tensor):
            return Identity.apply(tensor)

    pipe = Pipe(
        nn.Sequential(M(), M()),
        [1, 1],
        devices=["cpu", "cpu"],
        checkpoint="always",
    )

    source = torch.rand(42)
    result = pipe(source)

    # Two identity layers must hand the input back unchanged.
    assert torch.allclose(source, result)
def test_exception_early_stop_asap():
    """Partitions ahead of a failing partition should stop as soon as possible.

    Even if the first partitions have finished processing, the partition
    just before the failed one should be killed promptly.
    """

    class ExpectedException(Exception):
        pass

    class Pass(nn.Module):
        def forward(self, x):
            return x

    counter = 0

    class Counter(nn.Module):
        def forward(self, x):
            nonlocal counter
            # Sleep first so the failure downstream has time to surface.
            time.sleep(0.1)
            counter += 1
            return x

    class Raise(nn.Module):
        def forward(self, x):
            raise ExpectedException()

    stages = nn.Sequential(Pass(), Pass(), Counter(), Raise())
    pipe = Pipe(stages, [1, 1, 1, 1], devices=["cpu"] * 4, chunks=3)

    with pytest.raises(ExpectedException):
        pipe(torch.rand(3))

    # If the early stop doesn't work, it would be 3 instead.
    assert counter == 2
def test_inplace_incorrect_grad():
    """Show how an in-place op on a non-grad input interacts with checkpointing.

    NOTE(review): the closing comment says the observed gradient is 3 while
    the assertion expects 2, so this test looks like it is meant to fail --
    presumably it is marked as an expected failure outside this block.
    TODO confirm.
    """

    class M(nn.Module):
        def forward(self, foo_bar):
            # 'foo' requires grad but 'bar' does not. In-place operation on
            # 'bar' won't cause a RuntimeError.
            foo, bar = foo_bar

            # add_(1) is not idempotent, in contrast to relu_(). If it is
            # executed multiple times, it accumulates each difference onto
            # 'bar'.
            bar.add_(1)

            # 'bar' is still captured by checkpointing. 'foo' will get
            # incorrect grad.
            return foo * bar

    pipe = Pipe(nn.Sequential(M()), [1], devices=["cpu"], checkpoint="always")

    foo = torch.tensor([1.0], requires_grad=True)
    bar = torch.tensor([1.0])

    output = pipe((foo, bar))
    # The pipe is released before running backward.
    del pipe
    output.backward()

    # The gradient of 'foo' should be 2, but it is 3 actually because
    # bar.add_(1) was executed twice due to checkpointing.
    assert foo.grad.item() == 2.0
def test_forward_lockstep():
    """Check the order in which two partitions process three micro-batches."""
    timeline = []

    class DelayedLog(nn.Module):
        def __init__(self, partition, seconds):
            super().__init__()
            self.calls = 0
            self.partition = partition
            self.seconds = seconds

        def forward(self, x):
            time.sleep(self.seconds)

            # Record (how many calls this layer has seen, which layer it is).
            timeline.append((self.calls, self.partition))
            self.calls += 1

            return x

    stages = nn.Sequential(DelayedLog(0, seconds=0), DelayedLog(1, seconds=0.1))
    pipe = Pipe(stages, balance=[1, 1], devices=["cpu", "cpu"], chunks=3)
    pipe(torch.rand(3, 1))

    # Expected timeline: (Logs are recorded at !)
    #
    # Partition #0: 0! 1!   2!
    # Partition #1:    000! 111! 222!
    #
    expected = [(0, 0), (1, 0), (0, 1), (2, 0), (1, 1), (2, 1)]
    assert timeline == expected
def test_batch_size_indivisible():
    """A batch size not divisible by the chunk count must not emit warnings.

    Seven samples are pushed through a Pipe configured with four chunks and
    the list of recorded warnings is required to be empty.
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    model = nn.Sequential(nn.Linear(1, 1))
    model = Pipe(model, balance=[1], devices=["cpu"], chunks=4)

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures no warning is hidden by an existing filter.
        warnings.simplefilter("always")
        model(torch.rand(7, 1))

    # Indivisible batch size is legal.
    assert not record
def test_batch_size_indivisible(setup_rpc):
    """A batch size not divisible by the chunk count must not emit warnings.

    Seven samples are pushed through a Pipe configured with four chunks and
    the list of recorded warnings is required to be empty.
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    model = nn.Sequential(nn.Linear(1, 1))
    model = Pipe(model, chunks=4)

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures no warning is hidden by an existing filter.
        warnings.simplefilter("always")
        model(torch.rand(7, 1))

    # Indivisible batch size is legal.
    assert not record
def test_batch_size_small(setup_rpc):
    """A batch smaller than the chunk count must not emit warnings.

    Two samples are pushed through a Pipe configured with four chunks and
    the list of recorded warnings is required to be empty.
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    model = nn.Sequential(nn.Linear(1, 1))
    model = Pipe(model, chunks=4)

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures no warning is hidden by an existing filter.
        warnings.simplefilter("always")
        model(torch.rand(2, 1))

    # Batch size smaller than chunks is legal.
    assert not record
def test_batch_size_small():
    """A batch smaller than the chunk count must not emit warnings.

    Two samples are pushed through a Pipe configured with four chunks and
    the list of recorded warnings is required to be empty.
    """
    # Imported locally so the block does not rely on a module-level import.
    import warnings

    model = nn.Sequential(nn.Linear(1, 1))
    model = Pipe(model, balance=[1], devices=["cpu"], chunks=4)

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        # "always" ensures no warning is hidden by an existing filter.
        warnings.simplefilter("always")
        model(torch.rand(2, 1))

    # Batch size smaller than chunks is legal.
    assert not record
def test_input_varargs(setup_rpc):
    """Calling Pipe with two positional tensors must raise TypeError."""
    pipe = Pipe(nn.Sequential(nn.Linear(1, 1)))

    first, second = torch.rand(1), torch.rand(1)

    # TypeError: forward() takes 2 positional arguments but 3 were given
    with pytest.raises(TypeError):
        pipe(first, second)
def test_input_varargs():
    """Calling Pipe with two positional tensors must raise TypeError."""
    pipe = Pipe(nn.Sequential(nn.Linear(1, 1)), balance=[1], devices=["cpu"])

    first, second = torch.rand(1), torch.rand(1)

    # TypeError: forward() takes 2 positional arguments but 3 were given
    with pytest.raises(TypeError):
        pipe(first, second)
def test_inplace_on_requires_grad():
    """Backward through an in-place ReLU under checkpointing must fail.

    The RuntimeError raised by backward() has to match the in-place
    leaf-variable message.
    """
    layers = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True))
    pipe = Pipe(layers, checkpoint="always")

    result = pipe(torch.rand(1))

    message = r"a leaf Variable that requires grad .* used in an in-place operation."
    with pytest.raises(RuntimeError, match=message):
        result.backward()
def test_verify_module_duplicate_parameters_on_same_device():
    """Sharing one module across partitions on the same device is accepted.

    The test passes as long as constructing the Pipe does not raise.
    """

    class Surrogate(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

    # Both wrappers hold the very same convolution, hence the same parameters.
    shared_conv = nn.Conv2d(3, 3, 1)
    layers = nn.Sequential(Surrogate(shared_conv), Surrogate(shared_conv))

    Pipe(layers, [1, 1], devices=["cpu", "cpu"])
def test_empty_module(): # Empty sequential module is not illegal. model = nn.Sequential() model = Pipe(model, []) assert model(torch.tensor(42)) == torch.tensor(42) assert model((torch.tensor(42),)) == (torch.tensor(42),) # But only tensor or tensors is legal in Pipe. with pytest.raises(TypeError): model(42)
def test_verify_module_duplicate_parameters_on_distinct_devices():
    """Sharing one module across partitions on different devices is rejected."""

    class Surrogate(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

    # Both wrappers hold the very same convolution, hence the same parameters.
    shared_conv = nn.Conv2d(3, 3, 1)
    layers = nn.Sequential(Surrogate(shared_conv), Surrogate(shared_conv))

    expected_message = "module with duplicate parameters on distinct devices is not supported"
    with pytest.raises(ValueError, match=expected_message):
        Pipe(layers, [1, 1], devices=["cpu", "cuda"])
def test_devices():
    """A three-layer Pipe built without explicit devices reports three CPUs."""
    layers = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1), nn.Linear(1, 1))

    # There are extra two devices.
    pipe = Pipe(layers)

    # Extra devices must be discarded.
    cpu = torch.device("cpu")
    assert pipe.devices == [cpu] * 3
def test_input_singleton():
    """Feed a one-element tuple through Pipe and check gradients reach everything."""

    class One(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, only_a):
            (a,) = only_a
            return (self.fc(a),)

    pipe = Pipe(nn.Sequential(One()), balance=[1], devices=["cpu"], chunks=2)

    a = torch.rand(10, 1, requires_grad=True)

    (a_out,) = pipe((a,))
    a_out.mean().backward()

    # Every parameter and the input itself must have received a gradient.
    assert all(param.grad is not None for param in pipe.parameters())
    assert a.grad is not None
def test_1to3(balance, checkpoint):
    """Run a three-layer model with a skip connection from layer 1 to layer 3.

    `balance` and `checkpoint` are supplied by the caller (presumably via
    parametrization outside this block -- TODO confirm). The test is skipped
    when fewer CUDA devices than partitions are available.
    """
    required = len(balance)
    if torch.cuda.device_count() < required:
        pytest.skip("at least %d cuda devices required" % required)

    @skippable(stash=["1to3"])
    class Layer1(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            # Stash the raw input so the third layer can pop it later.
            yield stash("1to3", input)
            return self.conv(input)  # noqa

    class Layer2(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            return self.conv(input)

    @skippable(pop=["1to3"])
    class Layer3(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(3, 3, 1)

        def forward(self, input):
            skip_1to3 = yield pop("1to3")
            return self.conv(input) + skip_1to3

    layers = nn.Sequential(Layer1(), Layer2(), Layer3())
    layers = convert_to_balance(layers, balance)
    pipe = Pipe(layers, chunks=3, checkpoint=checkpoint)

    in_device, out_device = pipe.devices[0], pipe.devices[-1]

    batch = torch.rand(30, 3, 224, 224, device=in_device, requires_grad=True)
    result = pipe(batch)
    result.mean().backward()

    # Reference norms are hard-coded; the output check uses a loose tolerance.
    expected_out_norm = torch.tensor(1039.0, device=out_device)
    expected_grad_norm = torch.tensor(0.0004533053, device=in_device)

    assert torch.allclose(result.norm(), expected_out_norm, atol=6e-1)
    assert torch.allclose(batch.grad.norm(), expected_grad_norm)
def test_tuple_wait(cuda_sleep):
    """Check the gradient of the second tensor of a tuple across two GPUs."""
    # In v0.0.3, Wait is applied to only the first tensor on a micro-batch.
    # Under this behavior, if checkpointing was disabled, there's a possibility
    # that gradient accumulations on other tensors are not synchronized
    # properly to the copy stream.
    class Sleep(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            return x.detach()

        @staticmethod
        def backward(ctx, grad):
            # Delay the backward pass on the device that holds the gradient.
            with torch.cuda.device(grad.device):
                cuda_sleep(0.05)
            return grad

    class Layer1(nn.Module):
        def __init__(self):
            super().__init__()
            self.ones = nn.Parameter(torch.ones(32, 3, 32, 32, requires_grad=True))

        def forward(self, pair):
            a, b = pair
            a = a * self.ones
            return a * 1, b * 2, b * 3

    class Layer2(nn.Module):
        def __init__(self):
            super().__init__()
            self.ones = nn.Parameter(torch.ones(32, 3, 32, 32, requires_grad=True))

        def forward(self, triple):
            a, b, c = triple
            a = a * self.ones
            b = Sleep.apply(b)
            return a + b + c

    first_stage = Layer1().cuda(0)
    second_stage = Layer2().cuda(1)
    pipe = Pipe(nn.Sequential(first_stage, second_stage), chunks=32, checkpoint="never")

    a = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)
    b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True)

    result = pipe((a, b))
    result.norm().backward()

    # Wait for both devices before reading the gradient.
    for device_index in (0, 1):
        torch.cuda.synchronize(device_index)

    assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000))