def test_link_register():
    """Run ``link_register`` on one thread and check the value it stores.

    All 16 uint32 lanes of the output buffer must equal the program's
    address plus 0x28.
    """
    with Driver() as drv:
        out = drv.alloc(16, 'uint32')
        program = drv.program(link_register)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[out.address],
        )
        expected = program.address + 0x28
        assert np.all(out == expected)
def run_code(code, nout):
    """Execute ``code`` inside ``boilerplate`` and return the output rows.

    A ``(nout, 16)`` int32 buffer is allocated and its address is passed as
    the only uniform.  A host-side copy of the buffer is returned so the
    result stays valid after the driver context is closed.
    """
    with Driver() as drv:
        out = drv.alloc((nout, 16), 'int32')
        program = drv.program(boilerplate, code, nout)
        drv.execute(n_threads=1, program=program, uniforms=[out.address])
        return np.copy(out)
def test_absolute_jump():
    """Run ``absolute_jump`` and expect ``ASSERT_OK`` in every output lane."""
    with Driver() as drv:
        out = drv.alloc(16, 'int32')
        program = drv.program(absolute_jump)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[out.address],
        )
        assert np.all(out == ASSERT_OK)
def test_unpack_R4():
    """Check the ``unpack_R4`` program against host-side reference values.

    Input rows:
      0    -- the random values as float32 bit patterns
      1    -- the random values as float16 bit patterns
      2    -- the same float16 bit patterns shifted left by 16 bits
      3..6 -- random 32-bit words

    Output rows 0..2 must reproduce the random values (rtol=1e-3); output
    row ``3 + k`` must equal byte ``k`` of the matching input word divided
    by 255.
    """
    values = np.random.randn(16)

    src = np.zeros((7, 16), dtype='uint32')
    src[0] = unpack('16L', values.astype('float32'))
    src[1] = unpack('16H', values.astype('float16'))
    src[2] = unpack('16H', values.astype('float16'))
    src[2] <<= 16
    random_words = [getrandbits(32) for _ in range(4 * 16)]
    src[3:7] = np.array(random_words).reshape(4, 16)

    with Driver() as drv:
        gpu_src = drv.copy(src)
        gpu_dst = drv.alloc((7, 16), dtype='uint32')
        drv.execute(
            n_threads=1,
            program=drv.program(unpack_R4),
            uniforms=[gpu_src.address, gpu_dst.address],
        )
        # Take host copies while the driver memory is still mapped.
        X = np.copy(gpu_src)
        Y = np.copy(gpu_dst)

    # Rows 0..2: float32 / float16 / shifted-float16 inputs.
    for row in range(3):
        assert np.allclose(values, unpack('16f', Y[row]), rtol=1e-3)

    # Rows 3..6: one byte of each word, scaled to [0, 1].
    for row, shift in zip(range(3, 7), (0, 8, 16, 24)):
        expected = ((X[row] >> shift) & 0xff) / 255.0
        assert np.allclose(expected, unpack('16f', Y[row]), rtol=1e-7)
def run_code(code, X):
    """Execute ``code`` inside ``boilerplate`` with ``X`` as input.

    ``X`` is copied into driver memory twice: once as the input buffer and
    once as the initial contents of the output buffer.  Both addresses are
    passed as uniforms, and a host-side copy of the output buffer is
    returned.
    """
    with Driver() as drv:
        src = drv.copy(X)
        dst = drv.copy(src)
        program = drv.program(boilerplate, code, src.shape[0])
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        return np.copy(dst)
def test_with_namespace():
    """Run ``with_namespace`` and expect it to overwrite every lane with 4.

    The buffer is pre-filled with 1234, so a passing test shows the program
    actually wrote the result.
    """
    with Driver() as drv:
        out = drv.alloc((1, 16), 'int32')
        out[:] = 1234
        program = drv.program(with_namespace)
        drv.execute(n_threads=1, program=program, uniforms=[out.address])
        assert np.all(out == 4)
def run_code(code, X, output_shape, output_type):
    """Execute ``code`` inside ``boilerplate`` and return a typed output.

    ``X`` is copied into driver memory as the input buffer, and an output
    buffer of ``output_shape`` / ``output_type`` is allocated.  Both
    addresses are passed as uniforms.  A host-side copy of the output
    buffer is returned.
    """
    with Driver() as drv:
        src = drv.copy(X)
        dst = drv.alloc(output_shape, dtype=output_type)
        program = drv.program(boilerplate, code, output_shape[0])
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        return np.copy(dst)
def test_horizontal_32bit_stride_load():
    """The first 16 columns of a 16x32 input must land in the 16x16 output."""
    with Driver() as drv:
        src = drv.alloc((16, 32), dtype='uint32')
        src[:] = np.arange(16 * 32, dtype='uint32').reshape(16, 32)
        dst = drv.alloc((16, 16), dtype='uint32')
        program = drv.program(horizontal_32bit_stride_load)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        assert np.all(src[:, :16] == dst)
def test_horizontal_32bit_partial():
    """An 8x8 input must appear at rows/cols 4..11 of the 16x16 output."""
    with Driver() as drv:
        src = drv.alloc((8, 8), dtype='uint32')
        src[:] = np.arange(8 * 8, dtype='uint32').reshape(8, 8)
        dst = drv.alloc((16, 16), dtype='uint32')
        program = drv.program(horizontal_32bit_partial)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        window = dst[4:12, 4:12]
        assert np.all(src == window)
def test_vertical_32bit_load():
    """The 64x16 output must be the transpose of the 16x64 input."""
    with Driver() as drv:
        src = drv.alloc((16, 64), dtype='uint32')
        src[:] = np.arange(16 * 64, dtype='uint32').reshape(16, 64)
        dst = drv.alloc((64, 16), dtype='uint32')
        program = drv.program(vertical_32bit_load)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        assert np.all(src == dst.T)
def test_horizontal_32bit_load_calc_and_store_another_buffer():
    """The output must equal the input with 1 added to every lane of row 0."""
    with Driver() as drv:
        src = drv.alloc((64, 16), dtype='uint32')
        src[:] = np.arange(64 * 16, dtype='uint32').reshape(64, 16)
        dst = drv.alloc((64, 16), dtype='uint32')
        program = drv.program(horizontal_32bit_load_calc_and_store)
        drv.execute(
            n_threads=1,
            program=program,
            uniforms=[src.address, dst.address],
        )
        # Apply the expected change to the input, then compare whole buffers.
        src[0] += 1
        assert np.all(src == dst)
def test_rotate_r4():
    """Check the ``rotate_r4`` program against ``list_half_rotate``.

    Sixteen random 32-bit words are copied into driver memory and the
    address of each word is handed to ``run_code`` as the input vector.
    Output row 0 must equal the original data, rows 1..15 must equal
    ``list_half_rotate(d, i)`` for i = 1..15, and rows 16..31 must equal
    ``list_half_rotate(d, i)`` for i = 0..15.
    """
    d = np.array([random.getrandbits(32) for i in range(16)]).astype(np.uint32)
    with Driver() as drv:
        # The data buffer must stay allocated while the program runs, so the
        # run happens inside this driver context.
        addr = drv.copy(d).address
        X = np.array([addr + 4 * i for i in range(16)], dtype=np.uint32)
        Y = run_code(rotate_r4, X, 1 + 15 + 16)

        # np.alltrue was removed in NumPy 2.0; np.all is the equivalent.
        assert np.all(Y[0] == d)
        for i in range(1, 16):
            Y_ref = list_half_rotate(d, i)
            assert np.all(Y[i] == Y_ref)
        for i in range(0, 16):
            Y_ref = list_half_rotate(d, i)
            assert np.all(Y[16 + i] == Y_ref)
def test_semaphore():
    """Run ``increment_thread`` on 10 threads and check the shared total.

    Each thread gets three uniforms: the address of ``X``, the address of
    ``Y`` and its own thread index.  After the run every lane of ``Y`` must
    equal ``nthreads * 10000``.
    """
    with Driver() as drv:
        nthreads = 10
        X = drv.alloc(16, dtype='uint32')
        Y = drv.alloc(16, dtype='uint32')
        X[:] = 0

        # One row of uniforms per thread.
        per_thread = np.zeros((nthreads, 3), dtype='uint32')
        per_thread[:, 0] = X.address
        per_thread[:, 1] = Y.address
        per_thread[:, 2] = np.arange(nthreads)

        program = drv.program(increment_thread, nthreads)
        drv.execute(n_threads=nthreads, program=program, uniforms=per_thread)

        expected_total = nthreads * 10000
        assert np.all(Y == expected_total)
def test_given_jump():
    """Run ``given_jmp`` with a jump offset computed from its labels.

    The positions of the ``entry`` and ``test`` labels are looked up with
    ``get_label_positions`` (defaulting to 0 when a label is absent), and
    ``test_pc - entry_pc - 32`` is passed as the first uniform.  The output
    buffer is pre-filled with 1234 and must read 4 in every lane afterwards.
    """
    # When a label name occurs more than once, the last position wins.
    positions = {lbl.name: pc for lbl, pc in get_label_positions(given_jmp)}
    entry_pc = positions.get('entry', 0)
    test_pc = positions.get('test', 0)

    with Driver() as drv:
        out = drv.alloc((1, 16), 'int32')
        out[:] = 1234
        jump_offset = test_pc - entry_pc - 32
        drv.execute(
            n_threads=1,
            program=drv.program(given_jmp),
            uniforms=[jump_offset, out.address],
        )
        assert np.all(out == 4)
def main():
    """Benchmark the 12-thread GPU sgemm against NumPy and print a report.

    Matrices ``A`` (p x q), ``B`` (q x r) and ``C`` (p x r) are allocated in
    driver memory and filled with seeded random values.  The reference
    ``R = alpha * A.dot(B) + beta * C`` is computed with NumPy before the
    GPU run; afterwards ``C`` is compared against ``R``.

    The work is split into ``p_div`` x ``r_div`` tiles, one per thread.
    Each thread receives 14 uniforms: the address of its own uniform row,
    the tile height in 16-row units, ``q``, the tile width in 64-column
    units, the tile base addresses in A/B/C, the three row strides, the bit
    patterns of ``alpha`` and ``beta``, the thread index and the thread
    count.
    """
    with Driver() as drv:
        p = 96
        q = 363
        r = 3072
        p_div = 2
        r_div = 6
        n_threads = p_div * r_div
        assert (p % 16 == 0 and p >= p_div * 16)
        assert (q >= 2)
        assert (r % 64 == 0 and r >= r_div * 64)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference
        start = time.time()
        R = alpha * A.dot(B) + beta * C
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc((n_threads, 14), 'uint32')
        uniforms[:, 0] = uniforms.addresses()[:, 0]

        th = 0
        # Tile sizes in units of 16 rows / 64 columns, rounded up.
        h = (p + 16 * p_div - 1) // (16 * p_div)
        w = (r + 64 * r_div - 1) // (64 * r_div)
        for i in range(p_div):
            for j in range(r_div):
                # The last tile in each direction takes whatever is left.
                uniforms[th, 1] = h if i != p_div - 1 else (p - i * h * 16) // 16
                uniforms[th, 2] = q
                uniforms[th, 3] = w if j != r_div - 1 else (r - j * w * 64) // 64
                uniforms[th, 4] = A.addresses()[i * 16 * h, 0]
                uniforms[th, 5] = B.addresses()[0, j * 64 * w]
                uniforms[th, 6] = C.addresses()[i * 16 * h, j * 64 * w]
                th += 1
        uniforms[:, 7] = A.strides[0]
        uniforms[:, 8] = B.strides[0]
        uniforms[:, 9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  'I' is used
        # instead of 'L' because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4 bytes produced by pack('f', ...).
        uniforms[:, 10] = struct.unpack('I', struct.pack('f', alpha))[0]
        uniforms[:, 11] = struct.unpack('I', struct.pack('f', beta))[0]
        uniforms[:, 12] = np.arange(n_threads)
        uniforms[:, 13] = n_threads

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        # Compute the error arrays once and reuse them for min and max.
        diff = R - C
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p, q=q, r=r))
        print('threads: {}'.format(n_threads))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu, Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
def main():
    """Benchmark the single-thread GPU sgemm against NumPy and print a report.

    Matrices ``A`` (p x q), ``B`` (q x r) and ``C`` (p x r) are allocated in
    driver memory and filled with seeded random values.  The reference
    ``R = alpha * A.dot(B) + beta * C`` is computed with NumPy before the
    GPU run; afterwards ``C`` is compared against ``R``.

    The single thread receives 12 uniforms: the address of the uniform
    buffer, ``p // 16``, ``q``, ``r // 64``, the base addresses of A/B/C,
    the three row strides and the bit patterns of ``alpha`` and ``beta``.
    """
    with Driver() as drv:
        p = 96
        q = 363
        r = 3072
        assert (p % 16 == 0)
        assert (q >= 2)
        assert (r % 64 == 0)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference
        start = time.time()
        R = alpha * A.dot(B) + beta * C
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc(12, 'uint32')
        uniforms[0] = uniforms.address
        # Integer division: the asserts above guarantee exact multiples, and
        # this avoids routing the block counts through a float.
        uniforms[1] = p // 16
        uniforms[2] = q
        uniforms[3] = r // 64
        uniforms[4] = A.address
        uniforms[5] = B.address
        uniforms[6] = C.address
        uniforms[7] = A.strides[0]
        uniforms[8] = B.strides[0]
        uniforms[9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  'I' is used
        # instead of 'L' because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4 bytes produced by pack('f', ...).
        uniforms[10] = struct.unpack('I', struct.pack('f', alpha))[0]
        uniforms[11] = struct.unpack('I', struct.pack('f', beta))[0]

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=1, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        # Compute the error arrays once and reuse them for min and max.
        diff = R - C
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p, q=q, r=r))
        print('threads: {}'.format(1))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu, Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
def GPU_conv(x, w, b, Relu_flag=0):
    """Run a stride-1, unpadded convolution on the GPU and return the result.

    The input, filters and bias are copied into zero-padded driver buffers
    whose sizes are rounded up so that the output height is a multiple of
    the thread count (12) and the output width and filter count are
    multiples of 16.  A CPU reference (im2col followed by a dot product) is
    computed for timing and error reporting only; the returned array comes
    from the GPU output buffer.

    Args:
        x: input array, unpacked as ``(N, C, H, W)``.
        w: filter array, unpacked as ``(FN, C, FH, FW)``.
        b: bias, copied into the first ``FN`` entries of the bias buffer.
        Relu_flag: ``Relu_flag + 1`` is passed to the GPU program as a
            uniform.  The CPU reference applies ``np.maximum(out, 0.0)``
            only when this is 0, matching the convention in ``GPU_dot``.

    Returns:
        A float64 array of shape ``(C, FN, oH, oW)`` filled by broadcasting
        the cropped GPU output.
    """
    with Driver() as drv:
        SIMD = 16
        n_threads = 12
        N, C, H, W = x.shape
        FN, C, FH, FW = w.shape

        # Round the working sizes up so the work splits evenly over threads
        # (rows) and SIMD lanes (columns, filters).
        calc_H = H
        calc_W = W
        calc_FN = FN
        eH = int(FH / 2) * 2
        eW = int(FW / 2) * 2
        oH = H - eH
        oW = W - eW
        modH = oH % n_threads
        modW = oW % SIMD
        modFN = FN % SIMD
        if (modH != 0):
            calc_H += n_threads - modH
        if (modW != 0):
            calc_W += SIMD - modW
        if (modFN != 0):
            calc_FN += SIMD - modFN
        calc_oH = calc_H - eH
        calc_oW = calc_W - eW
        th_oH = int(calc_oH / n_threads)
        th_iter = int((th_oH * calc_oW) / (64 / calc_FN * 16))

        convX = drv.alloc((N, C, calc_H, calc_W), 'float32')
        convW = drv.alloc((C, FH, FW, calc_FN), 'float32')
        convout = drv.alloc((1, calc_oH, calc_oW, calc_FN), 'float32')
        cb = drv.alloc(calc_FN, 'float32')
        convout[:] = 0
        convX[:] = 0
        convW[:] = 0
        cb[:] = 0
        pad = 0
        stride = 1
        convX[:, :, :H, :W] = x[:]
        # Transpose the filters to (C, FH, FW, FN), then copy.
        convW[:, :, :, :FN] = w.transpose(1, 2, 3, 0)[:]
        cb[:FN] = b[:]

        # CPU reference: im2col -> dot.
        start = time.time()
        out_h = 1 + int((H + 2 * pad - FH) / stride)
        out_w = 1 + int((W + 2 * pad - FW) / stride)
        col = im2col(x, FH, FW, stride, pad)
        col_W = w.reshape(FN, -1).T
        out = np.dot(col, col_W) + b
        # Apply ReLU only when the flag asks for it, so the error report
        # compares like with like (same convention as GPU_dot).
        if Relu_flag == 0:
            out = np.maximum(out, 0.0)
        CPU = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        cetime = time.time() - start

        uniforms = drv.alloc((n_threads, 16), 'uint32')
        uniforms[:, 0] = convW.addresses()[0, 0, 0, 0]
        for th in range(n_threads):
            # Each thread starts th_oH rows further down the input/output.
            uniforms[th, 1] = convX.addresses()[0, 0, th * th_oH, 0]
            uniforms[th, 2] = convout.addresses()[0, th * th_oH, 0, 0]
        uniforms[:, 3] = cb.addresses()[0]
        uniforms[:, 4] = th_iter
        uniforms[:, 5] = th_oH
        uniforms[:, 6] = int(calc_W * 4)
        uniforms[:, 7] = C
        uniforms[:, 8] = np.arange(1, (n_threads + 1))
        uniforms[:, 9] = n_threads
        uniforms[:, 10] = Relu_flag + 1

        # Kernel parameters are passed as program arguments.
        code = drv.program(conv, calc_H, calc_W, FH, FW, calc_FN, calc_oH, calc_oW)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start

        # NOTE(review): the leading dimension is C, not N or 1, so the single
        # GPU result is broadcast C times.  This looks unintended but is kept
        # because callers may depend on the returned shape -- confirm.
        GPU = np.zeros((C, FN, oH, oW))
        convout = convout.transpose(0, 3, 1, 2)
        # Crop the padding that was added for alignment.
        GPU[:] = convout[:, :FN, :oH, :oW]

        abs_err = np.abs(CPU[:] - GPU[:])
        print("===========Conv&Relu=============")
        print("x size:{0},w size:{1}".format(x.shape, w.shape))
        print("CPU time:{:.4f}".format(cetime * 1000), "[msec]")
        print("GPU time:{:.4f}".format(getime * 1000), "[msec]")
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        return GPU
def main():
    """Benchmark the 12-thread GPU sgemm on random matrix sizes.

    ``p`` and ``r`` are drawn from [768, 1024] and ``q`` from [2, 512], so
    the sizes are generally not multiples of the 16-row / 64-column tile
    units.  The work is split into ``p_div`` x ``r_div`` tiles; the last
    tile in each direction takes every remaining row or column.

    The reference ``R = alpha * RA.dot(RB) + beta * RC`` is computed with
    NumPy on host copies of the inputs, and ``C`` is compared against it
    after the GPU run.  Each thread receives 14 uniforms: the address of its
    own uniform row, the tile height in rows, ``q``, the tile width in
    columns, the tile base addresses in A/B/C, the three row strides, the
    bit patterns of ``alpha`` and ``beta``, the thread index and the thread
    count.
    """
    with Driver() as drv:
        p = random.randint(64 * 12, 1024)
        q = random.randint(2, 512)
        r = random.randint(64 * 12, 1024)

        assert (q >= 2)

        p_div = 2
        r_div = 6
        n_threads = p_div * r_div

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference, computed on host copies of the inputs.
        RA = A.copy()
        RB = B.copy()
        RC = C.copy()
        start = time.time()
        R = alpha * RA.dot(RB) + beta * RC
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc((n_threads, 14), 'uint32')
        uniforms[:, 0] = uniforms.addresses()[:, 0]

        th = 0
        # Number of whole 16-row blocks, spread over p_div tiles: the first
        # h_len tiles get h blocks, the others h - 1.
        p_up = p // 16
        h = (p_up + p_div - 1) // p_div
        h_len = p_div - (h * p_div - p_up)
        # Same scheme for the 64-column blocks over r_div tiles.
        r_up = r // 64
        w = (r_up + r_div - 1) // r_div
        w_len = r_div - (w * r_div - r_up)
        h_acc = 0
        for i in range(p_div):
            if i == p_div - 1:
                # The last tile takes every remaining row.
                hi = p - h_acc
            else:
                hi = 16 * h if i < h_len else 16 * (h - 1)
            w_acc = 0
            for j in range(r_div):
                if j == r_div - 1:
                    # The last tile takes every remaining column.
                    wj = r - w_acc
                else:
                    wj = 64 * w if j < w_len else 64 * (w - 1)
                uniforms[th, 1] = hi
                uniforms[th, 2] = q
                uniforms[th, 3] = wj
                uniforms[th, 4] = A.addresses()[h_acc, 0]
                uniforms[th, 5] = B.addresses()[0, w_acc]
                uniforms[th, 6] = C.addresses()[h_acc, w_acc]
                th += 1
                w_acc += wj
            h_acc += hi
        uniforms[:, 7] = A.strides[0]
        uniforms[:, 8] = B.strides[0]
        uniforms[:, 9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  'I' is used
        # instead of 'L' because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4 bytes produced by pack('f', ...).
        uniforms[:, 10] = struct.unpack('I', struct.pack('f', alpha))[0]
        uniforms[:, 11] = struct.unpack('I', struct.pack('f', beta))[0]
        uniforms[:, 12] = np.arange(n_threads)
        uniforms[:, 13] = n_threads

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        # Kept from the original: turns off NumPy's array summarisation
        # process-wide, for ad-hoc debugging prints of R and C.
        np.set_printoptions(threshold=np.inf)

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        # Compute the error arrays once and reuse them for min and max.
        diff = R - C
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p, q=q, r=r))
        print('threads: {}'.format(n_threads))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu, Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
def GPU_dot(x, w, b, Relu_flag=0):
    """Run an affine layer ``x . w + b`` on the GPU and return the result.

    The input is flattened to a single row of length ``q = C * H * W``.
    ``q`` is padded up to a multiple of the thread count (12) and the output
    width ``r = w.shape[1]`` up to a multiple of 16; the padding is
    zero-filled.  A CPU reference is computed for timing and error
    reporting only.

    Args:
        x: 4-D input unpacked as ``(N, C, H, W)``, or otherwise a 2-D input
            unpacked as ``(H, W)`` with ``N = C = 1``.
        w: weight matrix, copied into the top-left ``q x r`` corner.
        b: bias, copied into the first ``r`` columns.
        Relu_flag: ``Relu_flag + 1`` is passed to the GPU program as a
            uniform.  The CPU reference applies ``np.maximum(..., 0.0)``
            only when this is 0.

    Returns:
        A float64 array of shape ``(1, r)`` copied from the GPU output.
    """
    with Driver() as drv:
        SIMD = 16
        UNIFORM = 64
        n_threads = 12

        if x.ndim == 4:
            N, C, H, W = x.shape
        else:
            N = 1
            C = 1
            H, W = x.shape

        p = 1
        q = C * H * W
        r = w.shape[1]

        # Pad r to a multiple of SIMD and q to a multiple of n_threads.
        padded_r = r
        r_rem = r % SIMD
        if r_rem != 0:
            padded_r += SIMD - r_rem
        padded_q = q
        q_rem = q % n_threads
        if q_rem != 0:
            padded_q += n_threads - q_rem

        # Share of q handled by each thread.
        q_th = int(padded_q / n_threads)
        # Number of full uniform batches per thread, and the remainder.
        q_uni_iter = int(padded_q / n_threads / UNIFORM)
        q_uni_mod = int((padded_q / n_threads % UNIFORM))
        r_simd_iter = int(padded_r / SIMD)

        lhs = drv.alloc((p, padded_q), 'float32')
        rhs = drv.alloc((padded_q, padded_r), 'float32')
        bias = drv.alloc((p, padded_r), 'float32')
        out = drv.alloc((p, padded_r), 'float32')
        out[:] = 0.0
        lhs[:] = 0.0
        rhs[:] = 0.0
        bias[:] = 0.0
        lhs[:, :q] = x.reshape(1, q)[:]
        rhs[:q, :r] = w[:]
        bias[:, :r] = b[:]

        # CPU reference (timed).
        start = time.time()
        xx = x.reshape(x.shape[0], -1)
        CPUout = np.dot(lhs, rhs) + bias
        if Relu_flag == 0:
            CPUout = np.maximum(CPUout, 0.0)
        cetime = time.time() - start

        uniforms = drv.alloc((n_threads, 16), 'uint32')
        for th in range(n_threads):
            offset = int(th * q_th)
            uniforms[th, 0] = lhs.addresses()[0, offset]
            uniforms[th, 1] = rhs.addresses()[offset, 0]
        uniforms[:, 2] = out.addresses()[0, 0]
        uniforms[:, 3] = bias.addresses()[0, 0]
        uniforms[:, 4] = q_uni_iter
        uniforms[:, 5] = q_uni_mod + 1
        uniforms[:, 6] = np.arange(1, (n_threads + 1))
        uniforms[:, 7] = n_threads
        uniforms[:, 8] = Relu_flag + 1

        code = drv.program(dot, r_simd_iter)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start

        # Crop the alignment padding off the GPU output.
        out_r = np.zeros((p, r))
        out_r[:] = out[:, :r]

        print("===========Affine&Relu=============")
        if Relu_flag == 1:
            relu_report = "x size:{0},w size:{1},Relu:×"
        else:
            relu_report = "x size:{0},w size:{1},Relu:〇"
        print(relu_report.format(x.shape, w.shape))
        print("CPU time:{:.4f}[msec]".format(cetime * 1000))
        print("GPU time:{:.4f}[msec]".format(getime * 1000))
        print('minimum absolute error: {:.4e}'.format(
            float(np.min(np.abs(CPUout[:, :r] - out_r[:, :r])))))
        print('maximum absolute error: {:.4e}'.format(
            float(np.max(np.abs(CPUout[:, :r] - out_r[:, :r])))))
        return out_r
def GPU_pool(x, stride, pad):
    """Run 2x2 max pooling on the GPU and return the result.

    The input is transposed to channel-last layout in driver memory, with
    the channel count padded up to a multiple of 16.  A CPU reference
    (im2col followed by a row-wise max) is computed for timing and error
    reporting only.

    Args:
        x: input array, unpacked as ``(N, C, H, W)``.
        stride: window stride, used by the CPU reference, the per-thread
            input offsets and the GPU program.
        pad: padding forwarded to ``im2col`` for the CPU reference.

    Returns:
        A float64 array of shape ``(1, C, H // 2, W // 2)`` copied from the
        GPU output buffer.
    """
    with Driver() as drv:
        SIMD = 16
        UNIFORM = 64
        n_threads = 12

        N, C, H, W = x.shape

        # Pad the channel count to a multiple of SIMD.
        cal_C = C
        c_rem = C % SIMD
        if c_rem != 0:
            cal_C += SIMD - c_rem

        FH = 2
        FW = 2
        oH = int(H / FH)
        oW = int(W / FW)
        th_oH = int(oH / n_threads)
        th_iter = int((th_oH * oW) / SIMD)

        gpu_in = drv.alloc((N, H, W, cal_C), 'float32')
        gpu_out = drv.alloc((1, oH, oW, cal_C), 'float32')
        gpu_in[:] = 0
        gpu_in[:, :, :, :C] = x.transpose(0, 2, 3, 1)[:]

        # CPU reference (timed).
        start = time.time()
        out_h = int(1 + (H - FH) / stride)
        out_w = int(1 + (W - FW) / stride)
        col = im2col(x, FH, FW, stride, pad)
        col = col.reshape(-1, FH * FW)
        arg_max = np.argmax(col, axis=1)
        CPUout = np.max(col, axis=1)
        CPUout = CPUout.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        cetime = time.time() - start

        uniforms = drv.alloc((n_threads, 16), 'uint32')
        for th in range(n_threads):
            first_out_row = th * th_oH
            uniforms[th, 0] = gpu_in.addresses()[0, first_out_row * stride, 0, 0]
            uniforms[th, 1] = gpu_out.addresses()[0, first_out_row, 0, 0]
        uniforms[:, 2] = np.arange(1, (n_threads + 1))
        uniforms[:, 3] = n_threads

        code = drv.program(pool, H, W, cal_C, stride)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start

        print("===========Pooling=============")
        print("x size:{0},stride:{1},pad:{2}".format(x.shape, stride, pad))
        print("CPU time:{:.4f}[msec]".format(cetime * 1000))
        print("GPU time:{:.4f}[msec]".format(getime * 1000))

        # Back to channel-first layout, dropping the padded channels.
        out_r = np.zeros((1, C, oH, oW))
        out_r[:] = gpu_out.transpose(0, 3, 1, 2)[:, :C, :, :]

        print('minimum absolute error: {:.4e}'.format(
            float(np.min(np.abs(CPUout[:] - out_r[:])))))
        print('maximum absolute error: {:.4e}'.format(
            float(np.max(np.abs(CPUout[:] - out_r[:])))))
        return out_r
def main():
    """Benchmark the single-thread GPU sgemm on random matrix sizes.

    ``p`` and ``r`` are drawn from [1, 1024] and ``q`` from [2, 512].  The
    reference ``R = alpha * RA.dot(RB) + beta * RC`` is computed with NumPy
    on host copies of the inputs, and ``C`` is compared against it after the
    GPU run.

    The single thread receives 12 uniforms: the address of the uniform
    buffer, ``p``, ``q``, ``r``, the base addresses of A/B/C, the three row
    strides and the bit patterns of ``alpha`` and ``beta``.
    """
    with Driver() as drv:
        p = random.randint(1, 1024)
        q = random.randint(2, 512)
        r = random.randint(1, 1024)

        assert(q >= 2)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference, computed on host copies of the inputs.
        RA = A.copy()
        RB = B.copy()
        RC = C.copy()
        start = time.time()
        R = alpha*RA.dot(RB) + beta*RC
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc(12, 'uint32')
        uniforms[0] = uniforms.address
        uniforms[1] = p
        uniforms[2] = q
        uniforms[3] = r
        uniforms[4] = A.address
        uniforms[5] = B.address
        uniforms[6] = C.address
        uniforms[7] = A.strides[0]
        uniforms[8] = B.strides[0]
        uniforms[9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  'I' is used
        # instead of 'L' because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4 bytes produced by pack('f', ...).
        uniforms[10] = struct.unpack('I', struct.pack('f', alpha))[0]
        uniforms[11] = struct.unpack('I', struct.pack('f', beta))[0]

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(
            n_threads=1,
            program=code,
            uniforms=uniforms
        )
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2*p*q*r + 3*p*r)/sec * 1e-9

        # Compute the error arrays once and reuse them for min and max.
        diff = R - C
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(
            p=p, q=q, r=r))
        print('threads: {}'.format(1))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_gpu, Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
rotate(broadcast, r2, -THR_NM) iadd(r0, r5, -1, set_flags=True) L.sem_down jzc(L.sem_down) sema_down(COMPLETED) # すべてのスレッドが終了するまで待つ nop() iadd(r0, r0, -1) interrupt() L.skip_fin exit(interrupt=False) with Driver() as drv: # 画像サイズ H = 360 W = 320 n_threads = 12 SIMD = 16 R = 60 th_H = int(H / n_threads) #1スレッドの担当行 th_ele = th_H * W #1スレッドの担当要素 io_iter = int(th_ele / (R * SIMD)) #何回転送するか IN = drv.alloc((H, W), 'float32') OUT = drv.alloc((H, W), 'float32') OUT[:] = 0.0
def main():
    """Run the convolution demo in an endless loop, reporting CPU vs GPU.

    A random ``(1, 3, 64, 64)`` input is convolved with sixteen random 5x5
    filters (stride 1, no padding).  Buffers, uniforms and the GPU program
    are set up once; each loop iteration recomputes the CPU reference
    (im2col followed by a dot product, without ReLU), runs the GPU program,
    prints timings, absolute errors and the speed-up, clears the GPU output
    buffer and sleeps for three seconds.  The loop never exits on its own.
    """
    with Driver() as drv:

        class Color:
            """ANSI escape sequences for coloured terminal output."""
            BLACK = '\033[30m'
            RED = '\033[31m'
            GREEN = '\033[32m'
            YELLOW = '\033[33m'
            BLUE = '\033[34m'
            PURPLE = '\033[35m'
            CYAN = '\033[36m'
            WHITE = '\033[37m'
            END = '\033[0m'
            # Was '\038[1m': '\038' parses as octal \03 followed by '8', so
            # the sequence did not start with ESC.
            BOLD = '\033[1m'
            UNDERLINE = '\033[4m'
            INVISIBLE = '\033[08m'
            REVERCE = '\033[07m'

        SIMD = 16
        n_threads = 12
        N = 1
        C = 3
        H = 64
        W = 64
        FN = 16
        FH = 5
        FW = 5
        Relu_flag = 1
        x = np.random.randn(N, C, H, W)
        w = np.random.randn(FN, C, FH, FW)
        b = np.random.randn(FN)
        N, C, H, W = x.shape
        FN, C, FH, FW = w.shape

        # Round the working sizes up so the work splits evenly over threads
        # (rows) and SIMD lanes (columns, filters).
        calc_H = H
        calc_W = W
        calc_FN = FN
        eH = int(FH / 2) * 2
        eW = int(FW / 2) * 2
        oH = H - eH
        oW = W - eW
        modH = oH % n_threads
        modW = oW % SIMD
        modFN = FN % SIMD
        if (modH != 0):
            calc_H += n_threads - modH
        if (modW != 0):
            calc_W += SIMD - modW
        if (modFN != 0):
            calc_FN += SIMD - modFN
        calc_oH = calc_H - eH
        calc_oW = calc_W - eW
        th_oH = int(calc_oH / n_threads)
        th_iter = int((th_oH * calc_oW) / (64 / calc_FN * 16))

        convX = drv.alloc((N, C, calc_H, calc_W), 'float32')
        convW = drv.alloc((C, FH, FW, calc_FN), 'float32')
        convout = drv.alloc((1, calc_oH, calc_oW, calc_FN), 'float32')
        cb = drv.alloc(calc_FN, 'float32')
        convout[:] = 0
        convX[:] = 0
        convW[:] = 0
        cb[:] = 0
        pad = 0
        stride = 1
        convX[:, :, :H, :W] = x[:]
        # Transpose the filters to (C, FH, FW, FN), then copy.
        convW[:, :, :, :FN] = w.transpose(1, 2, 3, 0)[:]
        cb[:FN] = b[:]

        uniforms = drv.alloc((n_threads, 16), 'uint32')
        uniforms[:, 0] = convW.addresses()[0, 0, 0, 0]
        for th in range(n_threads):
            # Each thread starts th_oH rows further down the input/output.
            uniforms[th, 1] = convX.addresses()[0, 0, th * th_oH, 0]
            uniforms[th, 2] = convout.addresses()[0, th * th_oH, 0, 0]
        uniforms[:, 3] = cb.addresses()[0]
        uniforms[:, 4] = th_iter
        uniforms[:, 5] = th_oH
        uniforms[:, 6] = int(calc_W * 4)
        uniforms[:, 7] = C
        uniforms[:, 8] = np.arange(1, (n_threads + 1))
        uniforms[:, 9] = n_threads
        uniforms[:, 10] = Relu_flag + 1

        # Kernel parameters are passed as program arguments.
        code = drv.program(conv, calc_H, calc_W, FH, FW, calc_FN, calc_oH, calc_oW)

        while (1):
            # CPU reference: im2col -> dot (no ReLU in this demo).
            start = time.time()
            out_h = 1 + int((H + 2 * pad - FH) / stride)
            out_w = 1 + int((W + 2 * pad - FW) / stride)
            col = im2col(x, FH, FW, stride, pad)
            col_W = w.reshape(FN, -1).T
            out = np.dot(col, col_W) + b
            CPU = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
            cetime = time.time() - start

            start = time.time()
            drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
            getime = time.time() - start

            # The leading dimension is the batch size N (the original used
            # the channel count C, which only duplicated the same result;
            # the reported min/max errors are unchanged).
            GPU = np.zeros((N, FN, oH, oW))
            tranout = convout.transpose(0, 3, 1, 2)
            # Crop the padding that was added for alignment.
            GPU[:] = tranout[:, :FN, :oH, :oW]

            abs_err = np.abs(CPU[:] - GPU[:])
            print("===========畳み込み層=============")
            print("x size:{0},w size:{1}".format(x.shape, w.shape))
            print("CPU time:{:.4f}".format(cetime * 1000), "[msec]")
            print("GPU time:{:.4f}".format(getime * 1000), "[msec]")
            print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
            print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
            print(Color.GREEN + "{:.2f}倍高速化!!!".format(cetime / getime) + Color.END)

            # Clear the output buffer before the next run.
            convout[:] = 0
            time.sleep(3)