testcase_automator(
    testcase_fused,
    {
        'B': [1, 5, 8],
        'lr': [
            [0.5, 1.0, 2.0],
            (0.5, 1.0, 2.0),
            np.array([0.5, 1.0, 2.0]),
            torch.as_tensor([0.5, 1.0, 2.0], dtype=torch.float),
            [1.0, 1.0, 1.0],
            (1.0, 1.0, 1.0),
            np.array([1.0, 1.0, 1.0]),
            torch.as_tensor([1.0, 1.0, 1.0], dtype=torch.float),
        ],
        'rho': [
            [0.1, 0.9, 0.99],
            (0.1, 0.9, 0.99),
            np.array([0.1, 0.9, 0.99]),
            torch.as_tensor([0.1, 0.9, 0.99], dtype=torch.float),
            [0.9, 0.9, 0.9],
            (0.9, 0.9, 0.9),
            np.array([0.9, 0.9, 0.9]),
            torch.as_tensor([0.9, 0.9, 0.9], dtype=torch.float),
        ],
        'eps': [
            [1e-6, 1e-5, 1e-7],
            (1e-6, 1e-5, 1e-7),
            np.array([1e-6, 1e-5, 1e-7]),
            torch.as_tensor([1e-6, 1e-5, 1e-7], dtype=torch.float),
            [1e-6, 1e-6, 1e-6],
            (1e-6, 1e-6, 1e-6),
            np.array([1e-6, 1e-6, 1e-6]),
            torch.as_tensor([1e-6, 1e-6, 1e-6], dtype=torch.float),
        ],
        'weight_decay': [
            [0.1, 0.03, 0.0],
            (0.1, 0.03, 0.0),
            np.array([0.1, 0.03, 0.0]),
            torch.as_tensor([0.1, 0.03, 0.0], dtype=torch.float),
            [0.0, 0.0, 0.0],
            (0, 0, 0),
            np.array([0, 0, 0]),
            torch.as_tensor([0.0, 0.0, 0.0], dtype=torch.float),
            0.3,
            0.0,
        ],
        'device': [torch.device('cuda:0')],
        'dtype': [torch.double],
    },
)
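# A minimal sketch of the harness contract these tests assume:
# `testcase_automator` presumably invokes the given testcase once per
# candidate value, overriding a single keyword argument at a time while all
# other arguments keep their defaults. The real helper lives in this repo's
# test utilities and may differ; everything below except its name is
# illustrative only.
def _testcase_automator_sketch(testcase, configs):
    for kwarg, values in configs.items():
        for value in values:
            # e.g. testcase(lr=[0.5, 1.0, 2.0]) with all other kwargs default.
            testcase(**{kwarg: value})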
testcase_automator(
    testcase_fused,
    {
        'B': [1, 5, 8],
        'lr': [
            [1e-3, 3e-3, 1e-2],
            (1e-3, 3e-3, 1e-2),
            np.array([1e-3, 3e-3, 1e-2]),
            torch.as_tensor([1e-3, 3e-3, 1e-2], dtype=torch.float),
            [1e-3, 1e-3, 1e-3],
            (1e-3, 1e-3, 1e-3),
            np.array([1e-3, 1e-3, 1e-3]),
            torch.as_tensor([1e-3, 1e-3, 1e-3], dtype=torch.float),
        ],
        'betas': [
            ([0.7, 0.8, 0.9], 0.999),
            ((0.7, 0.8, 0.9), 0.999),
            (np.array([0.7, 0.8, 0.9]), 0.999),
            (torch.as_tensor([0.7, 0.8, 0.9], dtype=torch.float), 0.999),
            ([0.9, 0.9, 0.9], 0.999),
            ((0.9, 0.9, 0.9), 0.999),
            (np.array([0.9, 0.9, 0.9]), 0.999),
            (torch.as_tensor([0.9, 0.9, 0.9], dtype=torch.float), 0.999),
            (0.9, [0.777, 0.888, 0.999]),
            (0.9, (0.777, 0.888, 0.999)),
            (0.9, np.array([0.777, 0.888, 0.999])),
            (0.9, torch.as_tensor([0.777, 0.888, 0.999], dtype=torch.float)),
            (0.9, [0.999, 0.999, 0.999]),
            (0.9, (0.999, 0.999, 0.999)),
            (0.9, np.array([0.999, 0.999, 0.999])),
            (0.9, torch.as_tensor([0.999, 0.999, 0.999], dtype=torch.float)),
            ([0.7, 0.8, 0.9], [0.777, 0.888, 0.999]),
            ((0.7, 0.8, 0.9), (0.777, 0.888, 0.999)),
            (np.array([0.7, 0.8, 0.9]), np.array([0.777, 0.888, 0.999])),
            (
                torch.as_tensor([0.7, 0.8, 0.9], dtype=torch.float),
                torch.as_tensor([0.777, 0.888, 0.999], dtype=torch.float),
            ),
            ([0.9, 0.9, 0.9], [0.999, 0.999, 0.999]),
            ((0.9, 0.9, 0.9), (0.999, 0.999, 0.999)),
            (np.array([0.9, 0.9, 0.9]), np.array([0.999, 0.999, 0.999])),
            (
                torch.as_tensor([0.9, 0.9, 0.9], dtype=torch.float),
                torch.as_tensor([0.999, 0.999, 0.999], dtype=torch.float),
            ),
        ],
        'eps': [
            [1e-7, 1e-8, 1e-9],
            (1e-7, 1e-8, 1e-9),
            np.array([1e-7, 1e-8, 1e-9]),
            torch.as_tensor([1e-7, 1e-8, 1e-9], dtype=torch.float),
            [1e-8, 1e-8, 1e-8],
            (1e-8, 1e-8, 1e-8),
            np.array([1e-8, 1e-8, 1e-8]),
            torch.as_tensor([1e-8, 1e-8, 1e-8], dtype=torch.float),
        ],
        'weight_decay': [
            [0.1, 0.03, 0.0],
            (0.1, 0.03, 0.0),
            np.array([0.1, 0.03, 0.0]),
            torch.as_tensor([0.1, 0.03, 0.0], dtype=torch.float),
            [0.0, 0.0, 0.0],
            (0, 0, 0),
            np.array([0, 0, 0]),
            torch.as_tensor([0.0, 0.0, 0.0], dtype=torch.float),
            0.3,
            0.0,
        ],
        'amsgrad': [True],
        'device': [torch.device('cuda:0')],
        'dtype': [torch.double],
    },
)
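# The hyperparameter lists above deliberately mix Python scalars, lists,
# tuples, np.ndarrays, and torch tensors: a fused optimizer for B models
# should accept either one shared value or B per-model values, e.g.
# betas=(0.9, [0.777, 0.888, 0.999]) shares beta1 across models while giving
# each model its own beta2. A sketch of the coercion such an optimizer
# presumably performs (the actual HFTA implementation is not shown here):
import numpy as np
import torch

def _coerce_per_model(value, B):
    # Broadcast a shared scalar to all B models; validate per-model sequences.
    if isinstance(value, (int, float)):
        return torch.full((B,), float(value))
    t = torch.as_tensor(np.asarray(value), dtype=torch.float)
    assert t.numel() == B, 'expected one hyperparameter value per model'
    return t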
    for b in range(B):
        embedding_fused.snatch_parameters(embedding_array[b], b)

    y_array = [embedding_array[b](x_array[b]) for b in range(B)]
    y_fused_actual = embedding_fused(x_fused)
    y_fused_expect = torch.cat([y.unsqueeze(0) for y in y_array], dim=0)
    try:
        assert_allclose(
            y_fused_actual.cpu().numpy(),
            y_fused_expect.cpu().numpy(),
            rtol=1e-4,
        )
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    testcase_automator(
        testcase,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'input_dim': [(32, ), (16, 16), (8, 8, 8), (128, ), (512, )],
            'num_embeddings': [50, 200, 2000],
            'embedding_dim': [32, 128, 786],
            'padding_idx': [0],
            'device': [torch.device('cuda:0')],
            'x_dtype': [torch.int, torch.long],
            'param_dtype': [torch.float, torch.double],
        },
    )
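# The fragment above begins mid-testcase; a sketch of the setup it implies,
# assuming the fused op takes the B per-model index batches stacked along a
# new leading dimension (the tensor layout and every name here except
# `get_hfta_op_for` and `snatch_parameters` are assumptions, not this repo's
# actual code):
import torch
import torch.nn as nn
from hfta.ops import get_hfta_op_for  # assumed import path

def _embedding_setup_sketch(B, N, num_embeddings, embedding_dim, device):
    embedding_array = [
        nn.Embedding(num_embeddings, embedding_dim).to(device)
        for _ in range(B)
    ]
    embedding_fused = get_hfta_op_for(nn.Embedding, B=B)(
        num_embeddings, embedding_dim).to(device)
    x_array = [
        torch.randint(num_embeddings, (N,), device=device) for _ in range(B)
    ]
    x_fused = torch.stack(x_array)  # [B, N]
    return embedding_array, embedding_fused, x_array, x_fused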
            rtol=1e-4,
        )
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    testcase_automator(
        testcase_MaxPool2d,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'C': [3, 64, 128],
            'kernel_size': [1, 3, 4],
            'HWin': [32, 256],
            'stride': [1, 3, 4],
            'padding': [1],
            'dilation': [2, 5],
            'return_indices': [True],
            'ceil_mode': [True],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.double],
        },
    )
    testcase_automator(
        testcase_AdaptiveAvgPool2d,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'C': [3, 64, 128],
            'HWin': [32, 256],
if __name__ == '__main__':
    testcase_automator(
        testcase_StepLR_fused,
        {
            'B': [1, 5, 8],
            'step_size': [
                (2, 3, 4),
                [2, 3, 4],
                np.array([2, 3, 4]),
                torch.as_tensor([2, 3, 4]),
            ],
            'gamma': [
                (0.1, 0.2, 0.3),
                [0.1, 0.2, 0.3],
                np.array([0.1, 0.2, 0.3]),
                torch.as_tensor([0.1, 0.2, 0.3]),
            ],
            'last_epoch': [
                5,
                (5, 6, 7),
                [5, 6, 7],
                np.array([5, 6, 7]),
                torch.as_tensor([5, 6, 7]),
            ],
        },
    )
    testcase_automator(
        testcase_StepLR_partially_fused,
        {
    for b in range(B):
        linear_fused.snatch_parameters(linear_array[b], b)

    y_array = [linear_array[b](x_array[b]) for b in range(B)]
    y_fused_actual = linear_fused(x_fused)
    y_fused_expect = torch.cat([y.unsqueeze(0) for y in y_array], dim=0)
    try:
        assert_allclose(
            y_fused_actual.cpu().numpy(),
            y_fused_expect.cpu().numpy(),
            rtol=1e-4,
            population_threshold=1e-3,
        )
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    testcase_automator(
        testcase,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'L': [1, 16, 32],
            'in_features': [10, 256, 1],
            'out_features': [100, 10, 1],
            'bias': [False],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.double],
        },
    )
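# `snatch_parameters` is what makes the two sides comparable: it copies one
# unfused module's parameters into slot b of the fused op so that both start
# from identical weights. A sketch of its presumable effect for Linear,
# assuming the fused op stacks per-model weights along a leading B dimension
# (HFTA's actual storage layout may differ):
import torch

def _snatch_parameters_sketch(fused, linear, b):
    with torch.no_grad():
        fused.weight[b].copy_(linear.weight)
        if linear.bias is not None:
            fused.bias[b].copy_(linear.bias)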
    layernorm_array = [nn.LayerNorm(*args, **kwargs) for _ in range(B)]
    layernorm_fused = get_hfta_op_for(nn.LayerNorm, B=B)(*args, **kwargs)
    # Init weights and biases.
    for b in range(B):
        layernorm_fused.snatch_parameters(layernorm_array[b], b)

    y_array = [layernorm_array[b](x_array[b]) for b in range(B)]
    y_fused_actual = layernorm_fused(x_fused)
    y_fused_expect = torch.cat([y.unsqueeze(0) for y in y_array], dim=0)
    try:
        assert_allclose(
            y_fused_actual.cpu().numpy(),
            y_fused_expect.cpu().numpy(),
            rtol=1e-4,
        )
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    testcase_automator(
        testcase,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'normalized_shape': [(8, 10, 20), (20, ), (10, 20)],
            'elementwise_affine': [False],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.double],
        },
    )
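# `get_hfta_op_for(nn.LayerNorm, B=B)` returns the fused counterpart of a
# stock op class, so the same (*args, **kwargs) construct both sides of the
# comparison. A self-contained toy showing the shape of that dispatch
# (_FusedLayerNormToy and the registry below are stand-ins, not HFTA's real
# classes):
import functools
import torch.nn as nn

class _FusedLayerNormToy(nn.Module):
    """Stand-in: B independent LayerNorms, not a truly fused kernel."""

    def __init__(self, *args, B=1, **kwargs):
        super().__init__()
        self.norms = nn.ModuleList(
            nn.LayerNorm(*args, **kwargs) for _ in range(B))

_FUSED_REGISTRY = {nn.LayerNorm: _FusedLayerNormToy}

def _get_hfta_op_for_sketch(op_class, B):
    return functools.partial(_FUSED_REGISTRY[op_class], B=B)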
    dropout=0.,
    bias=True,
    need_weights=True,
    use_mask=True,
):
    testcase_hfta(0, N, L, E, num_heads, dropout, bias, need_weights, use_mask)


if __name__ == '__main__':
    testcase_automator(
        testcase_hfta,
        {
            # Dropout is nondeterministic, so it is not tested here.
            'B': [1, 3, 5, 10],
            'N': [1, 8, 32, 64],
            'L': [16, 32, 64],
            'E': [16, 32, 64, 128],
            'num_heads': [1, 16, 32],
            'bias': [False],
            'need_weights': [False],
            'use_mask': [False],
        },
    )
    testcase_automator(
        testcase_single_model,
        {
            'N': [1, 8, 32, 64],
            'L': [16, 32, 64],
            'E': [16, 32, 64, 128],
            'num_heads': [1, 16, 32],
            'bias': [False],
            'need_weights': [False],
            y_fused_expect.cpu().numpy(),
            rtol=1e-4,
        )
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    testcase_automator(
        testcase_1d,
        {
            'num_features': [1, 16, 128],
            'B': [1, 2, 5, 10],
            'N': [16, 64],
            'L': [0, 1, 8, 64],
            'momentum': [0.01],
            'affine': [True, False],
            'track_running_stats': [True, False],
            'training': [True, False],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.float, torch.double],
        },
    )
    testcase_automator(
        testcase_2d,
        {
            'num_features': [1, 16, 128],
            'B': [1, 2, 5, 10],
            'N': [16, 64],
            'HWin': [32, 128],
            'momentum': [0.01],
    except AssertionError as e:
        dump_error_msg(e)


if __name__ == '__main__':
    # Conv1d unit tests
    testcase_automator(
        testcase_Conv1d,
        {
            'B': [1, 2, 5, 10],
            'N': [1, 8, 64],
            'Cin': [3, 128],
            'Cout': [1, 64],
            'kernel_size': [1, 5, 7],
            'Lin': [32, 128],
            'stride': [2],
            'padding': [2],
            'dilation': [2],
            'groups': [2],
            'bias': [False],
            'padding_mode': ['reflect', 'replicate', 'circular'],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.float, torch.double],
        },
    )
    # Conv2d unit tests
    testcase_automator(
        testcase_Conv2d,
        {
            'B': [1, 2, 5, 10],
        y = y_fused[:, b, :, :, :]
        assert y.size(0) == N
        assert y.size(1) == C
        for n in range(N):
            zero_channels = 0
            for c in range(C):
                s = y[n, c].sum()
                # Each channel either has all zeros or no zeros.
                try:
                    assert_allclose(s.cpu(), HWin**2 / (1 - p), rtol=1e-4)
                except AssertionError as e:
                    assert_allclose(s.cpu(), 0, atol=1e-4)
                    # s must be zero at this point.
                    zero_channels += 1
            assert_allclose(zero_channels / C, p, rtol=2e-1)


if __name__ == '__main__':
    testcase_automator(
        testcase,
        {
            'p': [0.25, 0.99],
            'B': [1, 8],
            'N': [1, 64],
            'C': [2000],
            'HWin': [8, 32],
            'device': [torch.device('cuda:0')],
            'dtype': [torch.double],
        },
    )
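# Why HWin**2 / (1 - p): assuming the test feeds an all-ones input (the setup
# is not shown in this fragment, but the expected sum implies it), inverted
# dropout scales each surviving activation by 1 / (1 - p), so a kept
# HWin x HWin channel sums to HWin**2 / (1 - p); e.g. p=0.25, HWin=8 gives
# 64 / 0.75 ≈ 85.33. A quick standalone check with the stock PyTorch op:
import torch

m = torch.nn.Dropout2d(p=0.25)  # a fresh module is in training mode
y = m(torch.ones(1, 2000, 8, 8))
kept = y[0, :, 0, 0] > 0  # channels are zeroed (or kept) wholesale
print(y[0][kept][0].sum())  # ~85.33 for every surviving channel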