def test_parameters_change():
    # This test ensures that all parameters are updated after an
    # update step.
    t.manual_seed(1)

    size = (30, 30)
    for batch_sz in [1]:
        for depth in range(0, 20, 6):
            width = c_in = c_out = batch_sz
            x = Variable(t.randn(batch_sz, c_in, *size)).cuda()
            target = Variable(t.randn(batch_sz, c_out, *size)).cuda()
            assert x.data.is_cuda

            net = MSDModule(c_in, c_out, depth, width)
            assert net is not None

            params0 = dict((n, p.data.clone()) for n, p in net.named_parameters())
            # Train for two iterations. The convolution weights in
            # the MSD layers are not updated after the first
            # training step because the final 1x1 convolution
            # weights are zero.
            optimizer = optim.Adam(net.parameters())
            optimizer.zero_grad()
            for _ in range(2):
                y = net(x)
                assert y is not None
                criterion = nn.L1Loss()
                loss = criterion(y, target)
                loss.backward()
                optimizer.step()

            params1 = dict(net.named_parameters())

            for name in params1.keys():
                p0, p1 = params0[name], params1[name]
                d = abs(p0 - p1.data.clone()).sum().item()
                assert 0.0 < d, (
                    f"Parameter {name} left unchanged: \n"
                    f"Initial value: {p0}\n"
                    f"Current value: {p1}\n"
                    f"Gradient: {p1.grad}\n"
                )

            # Check that the loss is not zero
            assert loss.abs().item() != approx(0.0)
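# Illustrative sketch (not part of the original test suite): the reasoning
# behind training for two iterations above. When the final layer's weights
# are zero, the chain rule multiplies all upstream gradients by zero, so
# only the final layer receives a non-zero gradient on the first backward
# pass; earlier layers start moving from the second step onward. This uses
# a generic two-layer convolutional model rather than MSDModule, and is
# prefixed with an underscore so pytest does not collect it as a test.
def _sketch_zero_final_layer_blocks_first_update():
    t.manual_seed(1)
    first = nn.Conv2d(1, 1, kernel_size=3, padding=1)
    last = nn.Conv2d(1, 1, kernel_size=1)
    nn.init.zeros_(last.weight)
    nn.init.zeros_(last.bias)

    x = t.randn(1, 1, 8, 8)
    target = t.randn(1, 1, 8, 8)

    loss = nn.L1Loss()(last(first(x)), target)
    loss.backward()

    # The first layer's gradient vanishes because it is multiplied by the
    # (zero) weights of the last layer; the last layer's gradient does not.
    assert first.weight.grad.abs().sum().item() == 0.0
    assert last.weight.grad.abs().sum().item() > 0.0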
def test_msd_gradients():
    t.manual_seed(1)

    dtype = t.double
    size = (11, 13)
    batch_sz = 2

    for depth in [9]:
        print(f"Depth: {depth}")
        width = c_in = c_out = batch_sz
        x = Variable(t.randn(batch_sz, c_in, *size, dtype=dtype)).cuda()
        x.requires_grad = True

        net = MSDModule(c_in, c_out, depth, width).cuda()
        net.double()

        # The weights of the final layer are initialized to zero by
        # default. This makes it trivial to pass gradcheck. Therefore,
        # we reinitialize all weights randomly.
        for p in net.parameters():
            p.data = t.randn_like(p.data)

        gradcheck(net, [x], raise_exception=True, atol=1e-4, rtol=1e-3)
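# Illustrative sketch (not part of the original test suite): gradcheck
# compares analytical gradients against finite-difference estimates, which
# is why the test above keeps the input and the network in double
# precision; single precision typically fails the default tolerances.
# A minimal standalone use of gradcheck with a plain module:
def _sketch_gradcheck_minimal():
    lin = nn.Linear(4, 3).double()
    x = t.randn(2, 4, dtype=t.double, requires_grad=True)
    assert gradcheck(lin, [x], raise_exception=True)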