def test_batchmatmul(self):
    """Run the ``batch_matmul`` TC on random CUDA tensors through ``self.check``.

    Two random inputs of shape (B, N, M) and (B, M, K) are created on the
    GPU, an ``Options("naive")`` object is customised, and everything is
    handed to ``self.check`` (defined outside this method).

    NOTE(review): this method itself does not compare the result with a
    reference such as ``torch.bmm``; whatever validation happens is inside
    ``self.check`` -- confirm there if a numerical check is expected.
    """
    # TC source, assembled from adjacent literals into a single string.
    lang = (
        " def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {"
        " Z(b, n, k) +=! X(b, n, mm) * Y(b, mm, k)"
        " } "
    )

    # Input tensors; X is created before Y so RNG consumption order is stable.
    B, K, M, N = 500, 26, 72, 26
    X = torch.randn(B, N, M).cuda()
    Y = torch.randn(B, M, K).cuda()
    inputs = [X, Y]

    # Mapping options for this kernel.
    options = Options("naive")
    options.useSharedMemory(True)
    options.usePrivateMemory(True)
    options.unrollCopyShared(True)
    options.outerScheduleFusionStrategy("Preserve3Coincident")
    options.fixParametersBeforeScheduling(True)
    options.tile([1])
    options.tileImperfectlyNested(False)
    options.mapToBlocks([72, 16, 1])
    options.mapToThreads([7, 26])
    options.unroll(128)

    # The return value is not inspected here, so it is not bound to a name.
    self.check(lang, "batch_matmul", options, inputs)
def test_tmm(self):
    """Check the ``tmm`` TC (A times B-transposed) against ``torch.mm``.

    Random CUDA matrices of shape (M, K) and (N, K) are fed to the TC via
    ``self.check``; the first returned tensor is then compared with
    ``torch.mm(A, B^T)`` using ``self.assert_almost_equal``.

    NOTE(review): the meaning of the last two arguments passed to
    ``assert_almost_equal`` (``M * N`` and ``3e-7``) is defined by that
    helper, which is not visible here.
    """
    tc_source = (
        " def tmm(float(M,K) A, float(N,K) B) -> (C) {"
        " C(m, n) +=! A(m, kk) * B(n, kk)"
        " } "
    )

    # Problem sizes and operands (lhs is drawn before rhs).
    rows, cols, depth = 128, 256, 32
    lhs = torch.randn(rows, depth).cuda()
    rhs = torch.randn(cols, depth).cuda()
    operands = [lhs, rhs]

    # Mapping options for this kernel.
    mapping = Options("naive")
    mapping.useSharedMemory(True)
    mapping.usePrivateMemory(True)
    mapping.unrollCopyShared(False)
    mapping.outerScheduleFusionStrategy("Preserve3Coincident")
    mapping.fixParametersBeforeScheduling(False)
    mapping.tile([4, 32])
    mapping.tileImperfectlyNested(False)
    mapping.mapToBlocks([64, 128])
    mapping.mapToThreads([1, 32])
    mapping.unroll(4)

    # Run the TC, then compare its first output with the PyTorch reference.
    results = self.check(tc_source, "tmm", mapping, operands)
    reference = torch.mm(lhs, rhs.transpose(0, 1))
    delta = results[0] - reference
    self.assert_almost_equal(delta, operands, rows * cols, 3e-7)
def test_C3(self):
    """Run the ``_C3`` TC on random CUDA tensors through ``self.check``.

    Inputs are a (B, WX) matrix and a (WY, WX) matrix; the TC reduces over
    the shared ``wxx`` index.  Mapping options start from
    ``Options("naive")`` and are customised below.

    NOTE(review): no comparison against a reference result is made in this
    method; any validation is left to ``self.check`` (defined elsewhere).
    """
    # TC source, assembled from adjacent literals into a single string.
    lang = (
        " def _C3(float(B, WX) I, float(WY, WX) W) -> (C3) {"
        " C3(b, wy) +=! I(b, wxx) * W(wy, wxx)"
        " } "
    )

    # Input tensors; local names mirror the TC parameter names.
    B, WX, WY = 128, 1000, 1024
    I = torch.randn(B, WX).cuda()
    W = torch.randn(WY, WX).cuda()
    inputs = [I, W]

    # Mapping options for this kernel.
    options = Options("naive")
    options.useSharedMemory(True)
    options.usePrivateMemory(True)
    options.unrollCopyShared(True)
    options.outerScheduleFusionStrategy("Preserve3Coincident")
    options.fixParametersBeforeScheduling(True)
    options.tile([8, 32, 32])
    options.tileImperfectlyNested(False)
    options.mapToBlocks([128, 128])
    options.mapToThreads([1, 32])
    options.unroll(256)

    # The return value is not inspected here, so it is not bound to a name.
    self.check(lang, "_C3", options, inputs)
def test_group_convolution(self):
    """Run the ``group_convolution`` TC on random CUDA tensors via ``self.check``.

    The TC takes an input of shape (N, G, C, H, W), weights of shape
    (G, F, C, KH, KW) and a bias of shape (G, F); the second TC statement
    adds the bias to the accumulated output.

    NOTE(review): no comparison against a reference result is made in this
    method; any validation is left to ``self.check`` (defined elsewhere).
    """
    # TC source, assembled from adjacent literals into a single string.
    lang = (
        " def group_convolution(float(N,G,C,H,W) I, float(G,F,C,KH,KW) W1, float(G,F) B) -> (O) {"
        " O(n, g, f, h, w) +=! I(n, g, c, h + kh, w + kw) * W1(g, f, c, kh, kw)"
        " O(n, g, f, h, w) = O(n, g, f, h, w) + B(g, f)"
        " } "
    )

    # Input tensors, created in the order input, weights, bias.
    N, G, C, F, H, W, KH, KW = 32, 32, 32, 32, 7, 7, 3, 3
    tI = torch.randn(N, G, C, H, W).cuda()
    tW = torch.randn(G, F, C, KH, KW).cuda()
    tB = torch.randn(G, F).cuda()
    inputs = [tI, tW, tB]

    # Mapping options for this kernel.
    options = Options("naive")
    options.useSharedMemory(True)
    options.usePrivateMemory(False)
    options.unrollCopyShared(True)
    options.outerScheduleFusionStrategy("Preserve3Coincident")
    options.fixParametersBeforeScheduling(False)
    options.tile([1, 1])
    options.tileImperfectlyNested(False)
    options.mapToBlocks([32, 32, 3])
    options.mapToThreads([8, 7, 7])
    options.unroll(256)

    # The return value is not inspected here, so it is not bound to a name.
    self.check(lang, "group_convolution", options, inputs)
def test_options(self):
    """Smoke-test the ``Options`` setters.

    Builds ``Options("naive")`` and calls a series of setters on it,
    printing a message before and after.  Nothing is asserted; the test
    passes as long as no call raises.
    """
    print("\nCreating mapping_options")

    mapping = Options("naive")

    # Memory promotion settings.
    mapping.useSharedMemory(True)
    mapping.unrollCopyShared(False)

    # Grid/block mapping, tiling and unrolling.
    mapping.mapToBlocks([256, 8])
    mapping.mapToThreads([4, 16, 4])
    mapping.tile([2, 8, 64, 128])
    mapping.unroll(128)

    # Scheduling settings.
    mapping.fixParametersBeforeScheduling(False)
    mapping.scheduleFusionStrategy("Max")
    mapping.outerScheduleFusionStrategy("Preserve3Coincident")

    print("Mapping options created successfully")
def test_KRU3(self):
    """Compile and run the three chained ``KRU3_*`` TCs on random CUDA inputs.

    All three definitions live in one TC string registered on a single
    ``TcCompilationUnit``.  The first output of each stage is fed, together
    with the next weight matrix, into the following stage:

    * ``KRU3_1(W2, X)      -> XW2``
    * ``KRU3_2(W1, XW2)    -> XW2W1``
    * ``KRU3_3(W0, XW2W1)  -> Y``

    The same ``Options`` object is reused for every stage.  Nothing is
    asserted about the results; the test passes if every stage compiles
    and runs without raising.
    """
    # TC source, assembled from adjacent literals into a single string.
    lang = (
        " def KRU3_1(float(D2, N2) W2, float(M, N0, N1, N2) X) -> (XW2) {"
        " XW2(m, n0, n1, d2) +=! X(m, n0, n1, n2_red) * W2(d2, n2_red)"
        " }"
        " def KRU3_2(float(D1, N1) W1, float(M, N0, N1, D2) XW2) -> (XW2W1) {"
        " XW2W1(m, n0, d1, d2) +=! XW2(m, n0, n1_red, d2) * W1(d1, n1_red)"
        " }"
        " def KRU3_3(float(D0, N0) W0, float(M, N0, D1, D2) XW2W1) -> (Y) {"
        " Y(m, d0, d1, d2) +=! XW2W1(m, n0_red, d1, d2) * W0(d0, n0_red)"
        " } "
    )

    # Input tensors, created in the order W0, W1, W2, X.
    M, D0, D1, D2, N0, N1, N2 = 256, 32, 32, 32, 16, 16, 16
    W0 = torch.randn(D0, N0).cuda()
    W1 = torch.randn(D1, N1).cuda()
    W2 = torch.randn(D2, N2).cuda()
    X = torch.randn(M, N0, N1, N2).cuda()

    # Mapping options shared by all three stages.
    options = Options("naive")
    options.useSharedMemory(True)
    options.usePrivateMemory(True)
    options.tile([4, 1, 1, 8, 16])
    options.mapToBlocks([64, 16, 16])
    options.mapToThreads([8, 4, 8])
    options.unroll(128)

    # One compilation unit holds all three TC definitions.
    cu = TcCompilationUnit()
    cu.define(lang)

    print("Running KRU3_1")
    outputs1 = cu.compile_and_run("KRU3_1", [W2, X], options=options)

    print("Running KRU3_2")
    XW2 = outputs1[0]
    outputs2 = cu.compile_and_run("KRU3_2", [W1, XW2], options=options)

    print("Running KRU3_3")
    XW2W1 = outputs2[0]
    # The final result is not inspected, so it is not bound to a name.
    cu.compile_and_run("KRU3_3", [W0, XW2W1], options=options)
def test_mlp(self):
    """Run the three-layer ``mlp3`` TC on random CUDA tensors via ``self.check``.

    Each layer in the TC is a reduction against a weight matrix, a bias
    add, and an ``fmax(..., 0)``; the TC declares three outputs
    (``O2``, ``O3``, ``O4``).

    NOTE(review): no comparison against a reference result is made in this
    method; any validation is left to ``self.check`` (defined elsewhere).
    """
    # TC source, assembled from adjacent literals into a single string.
    lang = (
        " def mlp3(float(B,N) I, float(O,N) W2, float(O) B2, float(P,O) W3,"
        " float(P) B3, float(Q,P) W4, float(Q) B4) -> (O2, O3, O4) {"
        " O2(b, o) +=! I(b, n) * W2(o, n)"
        " O2(b, o) = O2(b, o) + B2(o)"
        " O2(b, o) = fmax(O2(b, o), 0)"
        " O3(b, p) +=! O2(b, o) * W3(p, o)"
        " O3(b, p) = O3(b, p) + B3(p)"
        " O3(b, p) = fmax(O3(b, p), 0)"
        " O4(b, q) +=! O3(b, p) * W4(q, p)"
        " O4(b, q) = O4(b, q) + B4(q)"
        " O4(b, q) = fmax(O4(b, q), 0)"
        " } "
    )

    # Input tensors; local names mirror the TC parameter names and are
    # created in the same order as the TC signature.
    B, N, O, P, Q = 128, 128, 64, 32, 2
    I = torch.randn(B, N).cuda()
    W2 = torch.randn(O, N).cuda()
    B2 = torch.randn(O).cuda()
    W3 = torch.randn(P, O).cuda()
    B3 = torch.randn(P).cuda()
    W4 = torch.randn(Q, P).cuda()
    B4 = torch.randn(Q).cuda()
    inputs = [I, W2, B2, W3, B3, W4, B4]

    # Mapping options for this kernel.
    options = Options("naive")
    options.useSharedMemory(False)
    options.usePrivateMemory(False)
    options.unrollCopyShared(True)
    options.outerScheduleFusionStrategy("Max")
    options.fixParametersBeforeScheduling(False)
    options.tile([4])
    options.tileImperfectlyNested(False)
    options.mapToBlocks([128])
    options.mapToThreads([64])
    options.unroll(128)

    # The return value is not inspected here, so it is not bound to a name.
    self.check(lang, "mlp3", options, inputs)