def test_broadcast_alias():
    """Run ``qpu_broadcast_alias`` and verify every broadcast row.

    ``X`` holds ``0..15``.  For each ``rot`` in ``-15..15`` the test expects
    row ``ix`` of ``Y`` to contain the single value ``X[-rot % 16]`` in all
    16 positions.
    """
    rotations = range(-15, 16)

    with Driver() as drv:
        code = drv.program(qpu_broadcast_alias)

        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # The first two uniforms carry the addresses of the input and output.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, rot in enumerate(rotations):
            assert (Y[ix] == X[(-rot % 16)].repeat(16)).all()
def boilerplate_sfu_regs(sfu_regs, domain_limitter):
    """Run ``qpu_sfu_regs`` over ``sfu_regs`` and compare against ``ops``.

    Args:
        sfu_regs: Sized iterable of keys into ``ops``; one output row of
            ``Y`` is checked per entry, in iteration order.
        domain_limitter: Callable applied to the random float32 input
            before it is written to ``X``.
    """
    with Driver() as drv:
        code = drv.program(lambda asm: qpu_sfu_regs(asm, sfu_regs))

        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((len(sfu_regs), 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
        Y[:] = 0.0

        # The first two uniforms carry the addresses of the input and output.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, reg in enumerate(sfu_regs):
            msg = 'mov({}, None)'.format(reg)
            assert np.allclose(Y[ix], ops[reg](X), rtol=1e-4), msg
def test_parallel_16():
    """Compare ``qpu_serial`` with ``qpu_parallel_16`` on the same input.

    Both kernels must reproduce ``X`` in their own output buffer, and the
    16-thread run must take less than twice as long as the serial run.
    """
    with Driver() as drv:
        thread = 16

        serial_code = drv.program(qpu_serial)
        parallel_code = drv.program(qpu_parallel_16)

        X = drv.alloc((thread, 16), dtype='float32')
        Ys = drv.alloc((thread, 16), dtype='float32')
        Yp = drv.alloc((thread, 16), dtype='float32')
        unif = drv.alloc((thread, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        Ys[:] = -1
        Yp[:] = -1

        # One uniform row per thread: the row's own address, the row length,
        # and that thread's input and output row addresses.
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Ys.addresses()[:, 0]

        # perf_counter() is monotonic, so a wall-clock adjustment during the
        # run cannot distort the cost comparison asserted below.
        start = time.perf_counter()
        drv.execute(serial_code, unif.addresses()[0, 0])
        serial_cost = time.perf_counter() - start

        # Re-point the output uniform at the parallel result buffer.
        unif[:, 3] = Yp.addresses()[:, 0]

        start = time.perf_counter()
        drv.execute(parallel_code, unif.addresses()[0, 0], thread=thread)
        parallel_cost = time.perf_counter() - start

        # Kept as in the original: lifts numpy's global print threshold
        # (presumably so arrays print in full when debugging a failure).
        np.set_printoptions(threshold=np.inf)

        assert (X == Ys).all()
        assert (X == Yp).all()
        assert parallel_cost < serial_cost * 2
def test_signal_ldtmu():
    """Run ``qpu_signal_ldtmu`` and check its three output rows.

    Row 0 of ``Y`` must equal the random input ``X``; rows 1 and 2 must be
    filled with the constants 2 and 4 respectively.
    """
    with Driver() as drv:
        code = drv.program(qpu_signal_ldtmu)

        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((3, 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.random.randn(*X.shape).astype('float32')
        Y[:] = 0.0

        # The first two uniforms carry the addresses of the input and output.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        assert (Y[0] == X).all()
        assert (Y[1] == 2).all()
        assert (Y[2] == 4).all()
def test_rotate_alias():
    """Run ``qpu_rotate_alias`` and verify every rotation amount.

    ``X`` holds ``0..15``.  For each ``rot`` in ``-15..15`` both result
    planes of ``Y`` (first axis) must hold ``X`` cyclically shifted so that
    element ``-rot % 16`` comes first.
    """
    rotations = range(-15, 16)

    with Driver() as drv:
        code = drv.program(qpu_rotate_alias)

        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((2, len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # The first two uniforms carry the addresses of the input and output.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # Doubling X lets a 16-wide window express any cyclic shift.
        expected = np.concatenate([X, X])
        for ix, rot in enumerate(rotations):
            offset = -rot % 16
            assert (Y[:, ix] == expected[offset:offset + 16]).all()
def scopy(*, length, num_qpus=8, unroll_shift=0):
    """Copy a float32 vector with ``qpu_scopy`` and report the throughput.

    Args:
        length: Number of float32 elements; must be a positive multiple of
            ``16 * 8 * num_qpus * (1 << unroll_shift)``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel.
    """
    granule = 16 * 8 * num_qpus * (1 << unroll_shift)
    assert length > 0
    assert length % granule == 0

    print(f'==== scopy example ({length / 1024 / 1024} Mi elements) ====')

    # Room for two vectors of `length` 4-byte elements plus 1024 spare words.
    with Driver(data_area_size=(length * 2 + 1024) * 4) as drv:
        code = drv.program(
            qpu_scopy,
            num_qpus=num_qpus,
            unroll_shift=unroll_shift,
            code_offset=drv.code_pos // 8,
        )

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='float32')
        Y = drv.alloc(length, dtype='float32')

        X[:] = np.arange(*X.shape, dtype=X.dtype)
        Y[:] = -X

        # Source and destination must differ before the copy runs.
        assert not np.array_equal(X, Y)

        unif = drv.alloc(3, dtype='uint32')
        uniforms = (length, X.addresses()[0], Y.addresses()[0])
        for slot, value in enumerate(uniforms):
            unif[slot] = value

        print('Executing on QPU...')

        began = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        elapsed = monotonic() - began

        assert np.array_equal(X, Y)

        print(f'{elapsed} sec, {length * 4 / elapsed * 1e-6} MB/s')
def summation(*, length, num_qpus=8, unroll_shift=5):
    """Sum ``0..length-1`` with ``qpu_summation`` and verify the total.

    The partial results written to ``Y`` are added up on the host and
    compared, modulo ``2**32``, with the closed form ``(length-1)*length/2``.

    Args:
        length: Number of uint32 elements; must be a positive multiple of
            ``16 * 8 * num_qpus * (1 << unroll_shift)``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel.
    """
    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== summation example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:
        code = drv.program(qpu_summation,
                           num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')
        Y = drv.alloc(16 * num_qpus, dtype='uint32')

        X[:] = np.arange(length, dtype=X.dtype)
        Y.fill(0)

        # Accumulate in uint64 and convert to a Python int.  The builtin
        # sum() would add uint32 scalars, which can wrap and (on newer
        # NumPy) refuses the `% 2**32` below as out of range for uint32.
        assert int(np.sum(Y, dtype='uint64')) == 0

        unif = drv.alloc(3, dtype='uint32')
        unif[0] = length
        unif[1] = X.addresses()[0]
        unif[2] = Y.addresses()[0]

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        total = int(np.sum(Y, dtype='uint64'))
        assert total % 2**32 == (length - 1) * length // 2 % 2**32

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')
def test_quad_rotate_alias():
    """Run ``qpu_quad_rotate_alias`` and verify every rotation amount.

    ``X`` holds ``0..15`` viewed as four groups of four.  For each ``rot``
    in ``-15..15`` all four result planes of ``Y`` must hold ``X`` with each
    group cyclically shifted so that its element ``-rot % 4`` comes first.
    """
    rotations = range(-15, 16)

    with Driver() as drv:
        code = drv.program(qpu_quad_rotate_alias)

        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((4, len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # The first two uniforms carry the addresses of the input and output.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # Each 4-element group is doubled so a 4-wide window expresses any
        # cyclic shift within the group.
        expected = np.concatenate([X.reshape(4, 4)] * 2, axis=1)
        for ix, rot in enumerate(rotations):
            offset = -rot % 4
            assert (Y[:, ix] == expected[:, offset:offset + 4].ravel()).all()
def test_branch_link_reg():
    """Run ``qpu_branch_link_reg`` for all four flag combinations.

    Row 0 of ``Y`` must always equal ``X``.  Row 1 must be filled with 2
    when ``set_subroutine_link`` is false and with 1 when it is true,
    regardless of ``use_link_reg_direct``.
    """
    for set_subroutine_link, expected in [(False, 2), (True, 1)]:
        for use_link_reg_direct in [False, True]:
            # Identifies the failing combination in assertion output.
            msg = 'set_subroutine_link={}, use_link_reg_direct={}'.format(
                set_subroutine_link, use_link_reg_direct)

            with Driver() as drv:
                code = drv.program(lambda asm: qpu_branch_link_reg(
                    asm, set_subroutine_link, use_link_reg_direct))

                X = drv.alloc(16, dtype='uint32')
                Y = drv.alloc((2, 16), dtype='uint32')
                unif = drv.alloc(2, dtype='uint32')

                # Arbitrary test pattern; only the round trip through the
                # kernel is checked, not the values themselves.
                X[:] = (np.random.randn(16) * 1024).astype('uint32')
                Y[:] = 0.0

                unif[0] = X.addresses()[0]
                unif[1] = Y.addresses()[0, 0]

                drv.execute(code, unif.addresses()[0])

                assert (Y[0] == X).all(), msg
                assert (Y[1] == expected).all(), msg
def test_tmu_write():
    """Run ``qpu_tmu_write`` and check that it writes ``0..n*16-1``.

    The buffer is pre-filled with a sentinel so untouched words are
    detected, then the elapsed time and throughput are printed.
    """
    print()

    n = 256 * 1024

    # Data area: n * 16 four-byte words plus the two uniform words.
    with Driver(data_area_size=n * 16 * 4 + 2 * 4) as drv:
        code = drv.program(qpu_tmu_write)

        data = drv.alloc(n * 16, dtype='uint32')
        unif = drv.alloc(2, dtype='uint32')

        data[:] = 0xdeadbeaf
        unif[0] = n
        unif[1] = data.addresses()[0]

        # perf_counter() is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        drv.execute(code, unif.addresses()[0])
        elapsed = time.perf_counter() - start

        # Reduce inside numpy; the builtin all() would walk the ~4M-element
        # comparison result one Python object at a time.
        # NOTE(review): assumes `data` behaves like an ndarray here - confirm.
        assert (data == range(n * 16)).all()

        print(f'{elapsed} sec')
        print(f'{data.nbytes / elapsed / 1000 / 1000} MB/s')
def boilerplate_unary_ops(uni_ops, dst, src):
    """Run ``qpu_unary_ops`` for every op combination and check the results.

    Every ``(uni_op, dst_op, src_op)`` triple from the Cartesian product of
    the three op collections gets its own row of ``Y``.  Float destinations
    are compared with ``np.allclose(rtol=1e-2)``, integer destinations
    exactly.  Other destination dtypes are not checked.

    Args:
        uni_ops: Keys into ``ops`` naming the unary operations under test.
        dst: ``(dtype, ops)`` pair for the destination.
        src: ``(dtype, ops)`` pair for the source.
    """
    dst_dtype, dst_ops = dst
    src_dtype, src_ops = src

    with Driver() as drv:
        cases = list(itertools.product(uni_ops, dst_ops, src_ops))

        code = drv.program(
            lambda asm: qpu_unary_ops(asm, uni_ops, dst_ops, src_ops))

        # 16 * 4 bytes per row, so narrower dtypes get more elements.
        X = drv.alloc((16 * 4 // np.dtype(src_dtype).itemsize, ),
                      dtype=src_dtype)
        Y = drv.alloc((len(cases), 16 * 4 // np.dtype(dst_dtype).itemsize),
                      dtype=dst_dtype)
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.random.uniform(-(2**15), 2**15, X.shape).astype(src_dtype)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        # The destination dtype is fixed for the whole run; classify it once.
        dst_name = np.dtype(dst_dtype).name
        dst_is_float = dst_name.startswith('float')
        dst_is_integer = dst_name.startswith(('int', 'uint'))

        for ix, (uni_op, dst_op, src_op) in enumerate(cases):
            msg = '{}({}, {})'.format(uni_op, dst_op, src_op)
            if dst_is_float:
                assert np.allclose(ops[dst_op](Y[ix]),
                                   ops[uni_op](ops[src_op](X)),
                                   rtol=1e-2), msg
            elif dst_is_integer:
                assert np.all(
                    ops[dst_op](Y[ix]) == ops[uni_op](ops[src_op](X))), msg
def test_branch_abs_imm():
    """Run ``qpu_branch_abs_imm`` and check that ``Y == X + 2``.

    A one-instruction dummy program is assembled first; the kernel under
    test is then given an absolute target computed from that program's
    address plus ``16 * 8``.
    """
    with Driver() as drv:

        @qpu
        def qpu_dummy(asm):
            nop()

        dummy = drv.program(qpu_dummy)
        code = drv.program(lambda asm: qpu_branch_abs_imm(
            asm, int(dummy.addresses()[0] + 16 * 8)))

        X = drv.alloc((16, ), dtype='uint32')
        Y = drv.alloc((16, ), dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert (Y == X + 2).all()
def memset(*, fill, length, num_qpus=8, unroll_shift=1):
    """Fill a uint32 buffer with ``qpu_memset`` and report the throughput.

    Args:
        fill: 32-bit value the kernel should write to every element.
        length: Number of uint32 elements; must be a positive multiple of
            ``16 * num_qpus * (1 << unroll_shift)``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel.
    """
    assert length > 0
    assert length % (16 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== memset example ({length * 4 / 1024 / 1024} MiB) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:
        code = drv.program(qpu_memset,
                           num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')

        # Pre-fill with the bitwise complement so every element starts out
        # different from `fill`.  `~fill` is negative for a non-negative
        # Python int, and newer NumPy refuses to store out-of-range Python
        # ints in a uint32 array, so reduce it to its 32-bit pattern first.
        X.fill(~fill & 0xFFFFFFFF)

        assert not np.array_equiv(X, fill)

        unif = drv.alloc(3, dtype='uint32')
        unif[0] = X.addresses()[0]
        unif[1] = fill
        unif[2] = length

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert np.array_equiv(X, fill)

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')
def test_uniform_branch_reg():
    """Run ``qpu_uniform_branch_reg`` and check that ``Y`` is all 5.

    ``X[1]`` holds the absolute address of ``unif[4]``, whose value is 5;
    the surrounding uniforms hold 3, 4 and 6, so a wrong branch target
    would surface as a different value in ``Y``.
    """
    with Driver() as drv:
        code = drv.program(qpu_uniform_branch_reg)

        X = drv.alloc((16, ), dtype='uint32')
        Y = drv.alloc((16, ), dtype='uint32')
        unif = drv.alloc(6, dtype='uint32')

        X[1] = unif.addresses()[4]  # absolute address for uniform branch
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]
        unif[2] = 3
        unif[3] = 4
        unif[4] = 5  # uniform branch point here
        unif[5] = 6

        drv.execute(code, unif.addresses()[0])

        assert (Y == 5).all()
def sgemm_rnn_naive():
    """Time ``C = alpha * A.dot(B) + beta * C`` on the QPU against numpy.

    The 1024x1024 problem is split into a 2x4 grid of output tiles, one per
    thread.  Both timings and the min/max absolute and relative differences
    between the two results are printed.
    """
    thread = 8

    P = Q = R = 1024

    assert P % (16 * 2) == 0
    assert R % (16 * 4) == 0

    with Driver() as drv:
        code = drv.program(lambda asm: qpu_sgemm_rnn_naive(asm, thread))

        A = drv.alloc((P, Q), dtype='float32')
        B = drv.alloc((Q, R), dtype='float32')
        C = drv.alloc((P, R), dtype='float32')

        # Fixed seed; the draw order below determines the generated data.
        np.random.seed(0)
        alpha = np.random.randn()
        beta = np.random.randn()
        A_ref = np.random.randn(*A.shape).astype(A.dtype)
        B_ref = np.random.randn(*B.shape).astype(B.dtype)
        C_ref = np.random.randn(*C.shape).astype(C.dtype)

        A[:] = A_ref
        B[:] = B_ref
        C[:] = C_ref

        # Host reference result, timed for comparison.
        tick = getsec()
        C_ref[:] = alpha * A_ref.dot(B_ref) + beta * C_ref
        time_ref = getsec() - tick

        # Output tile extents for the 2 (rows) x 4 (columns) thread grid.
        tile_P = P // 2
        tile_R = R // 4

        def block_2x4_params(i, j):
            """Return the uniform words for output tile ``(i, j)``."""
            row0 = tile_P * i
            col0 = tile_R * j
            return [
                tile_P,
                Q,
                tile_R,
                A.addresses()[row0, 0],
                A.strides[0],
                B.addresses()[0, col0],
                B.strides[0],
                C.addresses()[row0, col0],
                C.strides[0],
                *pack_unpack('f', 'I', [alpha, beta]),
            ]

        params_per_thread = len(block_2x4_params(0, 0))
        unif_params = drv.alloc((thread, params_per_thread), dtype='uint32')
        for th in range(thread):
            # divmod(th, 4) gives the (row, column) position in the grid.
            unif_params[th] = block_2x4_params(*divmod(th, 4))

        unif = drv.alloc(2, dtype='uint32')
        unif[0] = unif_params.addresses()[0, 0]
        unif[1] = unif_params.shape[1]

        tick = getsec()
        drv.execute(code, unif.addresses()[0], thread=thread)
        time_gpu = getsec() - tick

        # Debug aid: with the threshold lifted, C or C - C_ref can be
        # printed in full.
        np.set_printoptions(threshold=np.inf)

        def Gflops(sec):
            return (2 * P * Q * R + 3 * P * R) / sec * 1e-9

        diff = C - C_ref
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / C_ref)

        print(f'==== sgemm example ({P}x{Q} times {Q}x{R}) ====')
        print(f'numpy: {time_ref:.4} sec, {Gflops(time_ref):.4} Gflop/s')
        print(f'QPU: {time_gpu:.4} sec, {Gflops(time_gpu):.4} Gflop/s')
        print(f'Minimum absolute error: {np.min(abs_err)}')
        print(f'Maximum absolute error: {np.max(abs_err)}')
        print(f'Minimum relative error: {np.min(rel_err)}')
        print(f'Maximum relative error: {np.max(rel_err)}')
def benchmark():
    """Time ``qpu_comatcopy_t`` on an 8192x8192 uint64 matrix.

    First runs a fixed 4x8 tile / 4x4 subtile configuration on 1 and 8
    QPUs, then sweeps tile and subtile shapes on 8 QPUs.  One result line
    is printed per run.
    """
    from time import monotonic

    import numpy as np

    from videocore6.driver import Driver

    def run(drv, unif, src, dst, num_qpus, rows, cols, tile_rows, tile_cols,
            subtile_rows, subtile_cols, code_offset=0):
        """Assemble one kernel variant, execute it and print its timing."""
        code = drv.program(
            qpu_comatcopy_t,
            num_qpus=num_qpus,
            tile_rows=tile_rows,
            tile_cols=tile_cols,
            subtile_rows=subtile_rows,
            subtile_cols=subtile_cols,
            code_offset=code_offset,
        )

        # Both matrices are reset to a running index before every run.
        for mat in (src, dst):
            mat[:, :] = np.arange(mat.size, dtype=mat.dtype).reshape(mat.shape)

        # 8 is the byte size of one uint64 element.
        uniforms = (
            rows,
            cols,
            pack_unpack('f', 'I', 1.),
            pack_unpack('f', 'I', 0.),
            src.addresses()[0, 0],
            cols * 8,
            dst.addresses()[0, 0],
            rows * 8,
        )
        for slot, value in enumerate(uniforms):
            unif[slot] = value

        began = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        elapsed = monotonic() - began

        print(f'{num_qpus} QPUs,',
              f'{rows} x {cols} matrix,',
              f'{tile_rows:2} x {tile_cols:2} tile,',
              f'{subtile_rows:2} x {subtile_cols:2} subtile:',
              f'{elapsed} seconds,',
              f'{rows * cols * 8 / elapsed * 1e-6} MB/s')

    rows = 8192
    cols = 8192

    # (num_qpus, tile_rows, tile_cols, subtile_rows, subtile_cols) per run:
    # the fixed configuration on 1 and 8 QPUs, then the shape sweep where a
    # tile always has 32 elements and a subtile always has 16.
    configs = [(qpus, 4, 8, 4, 4) for qpus in [1, 8]]
    for tile_rows in [2, 4, 8, 16]:
        for subtile_rows in [2, 4, 8]:
            configs.append((8, tile_rows, 32 // tile_rows,
                            subtile_rows, 16 // subtile_rows))

    with Driver(data_area_size=1100 * 1024 * 1024) as drv:
        unif = drv.alloc(8, dtype='uint32')
        src = drv.alloc((rows, cols), dtype='uint64')
        dst = drv.alloc((cols, rows), dtype='uint64')

        for (num_qpus, tile_rows, tile_cols,
             subtile_rows, subtile_cols) in configs:
            run(drv, unif, src, dst, num_qpus, rows, cols, tile_rows,
                tile_cols, subtile_rows, subtile_cols)
def test_multiple_dispatch_delay():
    """Measure three ways of waiting for ten dispatched kernels.

    1. Dispatch everything inside one dispatcher context and time the whole
       context.
    2. Dispatch one at a time, polling ``done`` via
       ``bench.wait_address`` after each dispatch.
    3. As 2, with a one-second sleep before each dispatch.

    After every phase row ``i`` of ``data`` must be filled with ``i``.
    """
    print()

    bench = BenchHelper('benchmarks/libbench_helper.so')

    with Driver() as drv:
        data = drv.alloc((10, 16), dtype='uint32')
        num_jobs = data.shape[0]

        # `i=i` pins the current loop value to each lambda, so the result
        # does not depend on when the driver invokes it.
        code = [
            drv.program(lambda asm, i=i: qpu_write_N(asm, i))
            for i in range(num_jobs)
        ]
        unif = drv.alloc((num_jobs, 2), dtype='uint32')
        done = drv.alloc(1, dtype='uint32')

        # Column vector 0..num_jobs-1; broadcasts against each 16-wide row.
        expected = np.arange(num_jobs).reshape(num_jobs, 1)

        data[:] = 0
        unif[:, 0] = data.addresses()[:, 0]
        unif[:, 1] = done.addresses()[0]

        # Phase 1: leaving the dispatcher context is part of the timed span.
        ref_start = time.perf_counter()
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num_jobs):
                csd.dispatch(code[i], unif.addresses()[i, 0])
        ref_end = time.perf_counter()
        assert (data == expected).all()

        # Phase 2: poll for completion after every single dispatch.
        data[:] = 0
        naive_results = np.zeros(num_jobs, dtype='float32')
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num_jobs):
                done[:] = 0
                start = time.perf_counter()
                csd.dispatch(code[i], unif.addresses()[i, 0])
                bench.wait_address(done)
                naive_results[i] = time.perf_counter() - start
        assert (data == expected).all()

        # Phase 3: same as phase 2 with an idle second before each dispatch.
        # Clear the output first so the assertion checks this phase's
        # writes and not the leftovers of phase 2.
        data[:] = 0
        sleep_results = np.zeros(num_jobs, dtype='float32')
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num_jobs):
                done[:] = 0
                time.sleep(1)
                start = time.perf_counter()
                csd.dispatch(code[i], unif.addresses()[i, 0])
                bench.wait_address(done)
                sleep_results[i] = time.perf_counter() - start
        assert (data == expected).all()

        print()
        print(
            f'API wait after {data.shape[0]} dispatch: {ref_end - ref_start:.6f} sec'
        )
        print(f'polling wait for each {data.shape[0]} dispatch:')
        print(f'    total: {np.sum(naive_results):.6f} sec')
        print(f'    details: {" ".join([f"{t:.6f}" for t in naive_results])}')
        print(
            f'polling wait for each {data.shape[0]} dispatch with between sleep:'
        )
        print(f'    total: {np.sum(sleep_results):.6f} sec + sleep...')
        print(f'    details: {" ".join([f"{t:.6f}" for t in sleep_results])}')
def test_tmu_load_1_slot_1_qpu():
    """Benchmark ``qpu_tmu_load_1_slot_1_qpu`` for 0..23 inserted nops.

    Runs once for each memory layout of ``X`` (``trans`` false/true).  For
    every nop count the kernel is dispatched ten times; each run must leave
    in ``Y`` the sum of ``X`` along the ``loop`` axis.  Per-run latencies
    are printed as a mean and saved as a scatter plot under
    ``benchmarks/``.
    """
    bench = BenchHelper('benchmarks/libbench_helper.so')

    for trans in [False, True]:

        with Driver() as drv:
            loop = 2**15

            X = drv.alloc((16, loop) if trans else (loop, 16),
                          dtype='float32')
            Y = drv.alloc(16, dtype='float32')
            unif = drv.alloc(6, dtype='uint32')
            done = drv.alloc(1, dtype='uint32')

            # The two strides are swapped when X is stored transposed.
            unif[0] = loop
            unif[1] = X.addresses()[0, 0]
            unif[2] = X.strides[int(trans)]
            unif[3] = X.strides[1 - int(trans)]
            unif[4] = Y.addresses()[0]
            unif[5] = done.addresses()[0]

            # One row per nop count, one column per repetition.
            results = np.zeros((24, 10), dtype='float32')

            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.set_title(
                f'TMU load latency (1 slot, 1 qpu, stride=({unif[2]},{unif[3]}))'
            )
            ax.set_xlabel('# of nop (between request and load signal)')
            ax.set_ylabel('sec')

            print()

            for nops in range(results.shape[0]):

                code = drv.program(
                    lambda asm: qpu_tmu_load_1_slot_1_qpu(asm, nops))

                for i in range(results.shape[1]):

                    with drv.compute_shader_dispatcher() as csd:

                        # Scale the input down by the length of the summed
                        # axis.
                        X[:] = np.random.randn(*X.shape) / X.shape[int(trans)]
                        Y[:] = 0.0
                        done[:] = 0

                        # perf_counter() is monotonic and high resolution,
                        # which suits these short intervals.
                        start = time.perf_counter()
                        csd.dispatch(code, unif.addresses()[0], thread=8)
                        bench.wait_address(done)
                        end = time.perf_counter()

                        results[nops, i] = end - start

                        assert np.allclose(Y,
                                           np.sum(X, axis=int(trans)),
                                           atol=1e-4)

                ax.scatter(np.zeros(results.shape[1]) + nops,
                           results[nops],
                           s=1,
                           c='blue')

                print('{:4}/{}\t{:.9f}'.format(
                    nops, results.shape[0],
                    np.sum(results[nops]) / results.shape[1]))

            ax.set_ylim(auto=True)
            ax.set_xlim(0, results.shape[0])
            fig.savefig(
                f'benchmarks/tmu_load_1_slot_1_qpu_{unif[2]}_{unif[3]}.png')

            # pyplot keeps every figure alive until it is closed; release
            # it so the two layouts do not accumulate open figures.
            plt.close(fig)