def test_basic(self, PE, mat_A, mat_B, bias, post_scale=None, RELU_scale=None):
    """Run a single FCN instruction on processing element ``PE`` and verify it.

    The operands are transferred with ``xfmlp.sendMat``, one instruction is
    queued with ``xfmlp.addFCNOp``, the instruction buffer is executed and
    cleared, and the result is read back and handed to
    ``self.multiply_and_cmp`` for comparison.

    Args:
        PE: Processing-element identifier forwarded to every ``xfmlp`` call.
        mat_A: Left operand; its shape supplies ``m`` and ``k``.
        mat_B: Right operand; its second dimension supplies ``n``.
        bias: Bias matrix forwarded to ``addFCNOp``.
        post_scale: Two-element sequence forwarded to ``addFCNOp`` as two
            scalars. Defaults to ``[1, 1]``.
        RELU_scale: Two-element sequence forwarded to ``addFCNOp`` as two
            scalars. Defaults to ``[1, 0]``.
    """
    # Build the defaults per call: a list literal in the signature would be
    # shared by every invocation and is passed on to multiply_and_cmp.
    if post_scale is None:
        post_scale = [1, 1]
    if RELU_scale is None:
        RELU_scale = [1, 0]

    m = mat_A.shape[0]
    k = mat_A.shape[1]
    n = mat_B.shape[1]
    print("test Fcn")
    print("test_basic: %d %d %d %d %d" % (m, k, n, post_scale[0], post_scale[1]))
    print("A: ", np.amax(mat_A), np.amin(mat_A), np.average(mat_A))
    print("B: ", np.amax(mat_B), np.amin(mat_B), np.average(mat_B))
    print("bias: ", np.amax(bias), np.amin(bias), np.average(bias))

    # Output buffer: (m, n) int16, C-ordered.
    C_fpga = np.zeros((m, n), dtype=np.int16, order='C')

    xfmlp.sendMat(mat_A, PE)
    xfmlp.sendMat(mat_B, PE)
    xfmlp.sendMat(C_fpga, PE)
    xfmlp.sendMat(bias, PE)
    xfmlp.addFCNOp(mat_A, mat_B, C_fpga, bias,
                   post_scale[0], post_scale[1],
                   RELU_scale[0], RELU_scale[1], PE)
    xfmlp.execute(PE)
    xfmlp.clearInstrBuf(PE)
    xfmlp.getMat(C_fpga, PE)

    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale, RELU_scale)
def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """Queue two FCN instructions back to back and verify both results.

    The first instruction uses ``A`` and ``B`` and writes ``C``; the second
    uses ``D`` and that same ``C`` and writes ``E``. Both outputs are read
    back and checked with ``test.multiply_and_cmp``.

    Args:
        int_range: Random values are drawn from ``[-int_range, int_range)``.
        m: Row count of ``A``, ``D``, ``C``, ``E`` and the bias matrices.
        k: Column count of ``A`` and ``D``; row count of ``B``.
        n: Column count of ``B``, ``C``, ``E`` and the bias matrices.
        add_bias: When truthy the bias matrices are random, otherwise zero.
    """
    # NOTE(review): D is (m, k) and C is (m, n), so the second product
    # presumably only makes sense when k == m -- confirm against callers.
    print ("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))
    A = np.random.randint(low=-int_range, high=int_range, size=(m, k), dtype=np.int16)
    B = np.random.randint(low=-int_range, high=int_range, size=(k, n), dtype=np.int16)
    C = np.zeros((m, n), dtype=np.int16)
    D = np.random.randint(low=-int_range, high=int_range, size=(m, k), dtype=np.int16)
    E = np.zeros((m, n), dtype=np.int16)

    # Only allocate the bias flavour that is actually used.
    if add_bias:
        b0 = np.random.randint(low=-int_range, high=int_range, size=(m, n), dtype=np.int32)
        b1 = np.random.randint(low=-int_range, high=int_range, size=(m, n), dtype=np.int32)
    else:
        b0 = np.zeros((m, n), dtype=np.int32)
        b1 = np.zeros((m, n), dtype=np.int32)

    xfmlp.sendMat(A)
    xfmlp.sendMat(B)
    xfmlp.sendMat(b0)
    xfmlp.sendMat(C)
    xfmlp.sendMat(D)
    xfmlp.sendMat(E)
    xfmlp.sendMat(b1)

    xfmlp.addFCNOp(A, B, C, b0, 1, 13, 307, 10)
    xfmlp.addFCNOp(D, C, E, b1, 1, 18, 307, 10)
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(C)
    xfmlp.getMat(E)

    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, [1, 13], [307, 10])
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, [1, 18], [307, 10])
def test_perf_gemm_gemm(A_range, B_range, bias_range, m, k, n, post_scale):
    """Time one GEMM instruction on random data and optionally verify it.

    Four timestamps are collected (start, after transfer and queuing, after
    execution, after read-back) and passed to ``test.test_perf`` together
    with the operation counts and the frequency from ``xfmlp.getFreq``.
    The result is compared through ``test.multiply_and_cmp`` unless all of
    ``m``, ``n`` and ``k`` exceed 4096.

    Args:
        A_range: ``mat_A`` values are drawn from ``[-A_range, A_range)``.
        B_range: ``mat_B`` values are drawn from ``[-B_range, B_range)``.
        bias_range: Bias values are drawn from ``[-bias_range, bias_range)``;
            ``0`` selects an all-zero bias.
        m: Rows of the left operand and of the result.
        k: Shared inner dimension.
        n: Columns of the right operand and of the result.
        post_scale: Two-element sequence forwarded to ``addGEMMOp``.
    """
    lhs = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    rhs = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        offsets = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        offsets = np.random.randint(low=-bias_range, high=bias_range,
                                    size=(m, n), dtype=np.int32)
    result = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (lhs, rhs, result, offsets):
        xfmlp.sendMat(operand)
    xfmlp.addGEMMOp(lhs, rhs, result, offsets, post_scale[0], post_scale[1])
    stamps.append(time.time())  # send to FPGA
    xfmlp.execute()
    stamps.append(time.time())  # call kernel
    xfmlp.getMat(result)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    freq = xfmlp.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, freq, m, k, n)

    # The golden comparison is skipped only when every dimension is large.
    if m > 4096 and n > 4096 and k > 4096:
        print("Skip golden comparision because large matrix size")
        return
    test.multiply_and_cmp(result, lhs, rhs, offsets, m, n, post_scale)
def test_perf_gemm(m, k, n, A_range=32764, B_range=32764, bias_range=32764, post_scale=None):
    """Time one GEMM instruction on random data and verify the result.

    Four timestamps are collected (start, after transfer and queuing, after
    execution and instruction-buffer clear, after read-back) and passed to
    ``test.test_perf``. The result is then always compared through
    ``test.multiply_and_cmp``.

    Args:
        m: Rows of the left operand and of the result.
        k: Shared inner dimension.
        n: Columns of the right operand and of the result.
        A_range: ``mat_A`` values are drawn from ``[-A_range, A_range)``.
        B_range: ``mat_B`` values are drawn from ``[-B_range, B_range)``.
        bias_range: Bias values are drawn from ``[-bias_range, bias_range)``;
            ``0`` selects an all-zero bias.
        post_scale: Two-element sequence forwarded to ``addGEMMOp``.
            Defaults to ``[1, 0]``.
    """
    # Fresh list per call instead of a shared mutable default; the value is
    # handed on to test.multiply_and_cmp.
    if post_scale is None:
        post_scale = [1, 0]

    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range != 0:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    else:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    C_fpga = np.zeros((m, n), dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    xfmlp.sendMat(mat_A)
    xfmlp.sendMat(mat_B)
    xfmlp.sendMat(C_fpga)
    xfmlp.sendMat(bias)
    xfmlp.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    xfmlp.getMat(C_fpga)
    timePointKernel.append(time.time())  # copy from FPGA

    total_operations = 2 * m * n * k + m * n * 3
    total_parallel_operations = 2 * m * n * k
    freq = xfmlp.getFreq()
    test.test_perf(timePointKernel, total_operations, total_parallel_operations, freq, m, k, n)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_textfiles(self, path_to_a, path_to_b, path_to_bias, post_scale):
    """Load operands from text files, run one FCN instruction and verify it.

    Args:
        path_to_a: Text file read with ``np.loadtxt`` as int16 (left operand).
        path_to_b: Text file read with ``np.loadtxt`` as int16 (right operand).
        path_to_bias: Text file read with ``np.loadtxt`` as int32 (bias).
        post_scale: Two-element sequence forwarded to ``addFCNOp``; the two
            trailing scale arguments of that call are fixed at ``1, 0``.
    """
    lhs, rhs = (np.loadtxt(path, dtype=np.int16) for path in (path_to_a, path_to_b))
    offsets = np.loadtxt(path_to_bias, dtype=np.int32)

    rows = lhs.shape[0]
    inner = lhs.shape[1]  # not used further; still raises on 1-D input as before
    cols = rhs.shape[1]
    result = np.zeros((rows, cols), dtype=np.int16, order='C')

    for operand in (lhs, rhs, result, offsets):
        xfmlp.sendMat(operand)
    xfmlp.addFCNOp(lhs, rhs, result, offsets, post_scale[0], post_scale[1], 1, 0)
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(result)

    self.multiply_and_cmp(result, lhs, rhs, offsets, rows, cols, post_scale)
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """Pad a coordinate-format sparse matrix, run one SPMV op and verify it.

    The element type and the alignment granularities are read from the
    module-level ``xclbin_opts`` mapping. ``nnz`` is padded with zero entries
    up to a multiple of ``min_nnz``; ``m`` and ``k`` are rounded with
    ``get_padded_size``. A dense ``(k, 1)`` vector ``B`` is generated, the
    product is queued with ``xfmlp.addSPMVOp`` and the ``(m, 1)`` result is
    checked with ``test.multiply_and_cmp_spmv``.

    Args:
        row: Row indices of the non-zero entries.
        col: Column indices of the non-zero entries.
        data: Values of the non-zero entries.
        m: Number of matrix rows before padding.
        k: Number of matrix columns before padding.
        nnz: Number of non-zero entries before padding.
        vector_range: Bound for the random int32 vector, or the argument
            passed to ``test.fillMod`` for the float vector.

    Raises:
        TypeError: If ``xclbin_opts["GEMX_dataType"]`` is neither ``"float"``
            nor ``"int32_t"``.
    """
    if xclbin_opts["GEMX_dataType"] == "float":
        data_type = np.float32
    elif xclbin_opts["GEMX_dataType"] == "int32_t":
        data_type = np.int32
    else:
        raise TypeError("type", xclbin_opts["GEMX_dataType"], "not supported")

    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    min_k = ddrWidth
    if xclbin_opts["GEMX_useURAM"] == "1":
        min_nnz = ddrWidth
        min_m = ddrWidth * int(xclbin_opts["GEMX_spmvUramGroups"])
    else:
        spmvWidth = int(xclbin_opts["GEMX_spmvWidth"])
        min_nnz = spmvWidth
        min_m = spmvWidth * int(xclbin_opts["GEMX_spmvMacGroups"])

    # Pad with 0s and adjust dimensions when necessary. All missing entries
    # are appended in one step rather than one np.append (a full array copy)
    # per padded element.
    pad = int(-nnz % min_nnz)
    if pad:
        zeros = [0] * pad
        row = np.append(row, zeros).astype(np.int32)
        col = np.append(col, zeros).astype(np.int32)
        # NOTE(review): data is cast to float32 here even when data_type is
        # int32 -- unchanged from the original; confirm this is intended.
        data = np.append(data, zeros).astype(np.float32)
        nnz = nnz + pad

    m = get_padded_size(m, min_m)
    k = get_padded_size(k, min_k)
    print("size:", m, k, "nnz:", nnz)

    if data_type == np.int32:
        B = np.random.randint(low=-vector_range, high=vector_range, size=(k, 1), dtype=np.int32)
    else:
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
    C = np.zeros((m, 1), dtype=data_type)

    A = xfmlp.sendSpMat(row, col, data, m, k, nnz, xclbin_opts, data_type)
    xfmlp.sendMat(B)
    xfmlp.sendMat(C)
    xfmlp.addSPMVOp(A, B, C, nnz, xclbin_opts)
    xfmlp.execute()
    xfmlp.clearInstrBuf()
    xfmlp.getMat(C)

    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
def test_basic(self, PE, mat_A, mat_B, bias, post_scale=None):
    """Run a single GEMM instruction on processing element ``PE`` and verify it.

    The operands are transferred with ``xfmlp.sendMat``, one instruction is
    queued with ``xfmlp.addGEMMOp``, the instruction buffer is executed and
    cleared, and the result is read back and handed to
    ``self.multiply_and_cmp`` for comparison.

    Args:
        PE: Processing-element identifier forwarded to every ``xfmlp`` call
            and printed with ``%d``.
        mat_A: Left operand; its shape supplies ``m`` and ``k``.
        mat_B: Right operand; its second dimension supplies ``n``.
        bias: Bias matrix forwarded to ``addGEMMOp``.
        post_scale: Two-element sequence forwarded to ``addGEMMOp`` as two
            scalars. Defaults to ``[1, 1]``.
    """
    # Build the default per call: a list literal in the signature would be
    # shared by every invocation and is passed on to multiply_and_cmp.
    if post_scale is None:
        post_scale = [1, 1]

    m = mat_A.shape[0]
    k = mat_A.shape[1]
    n = mat_B.shape[1]
    print("test_basic(PE=%d): %d %d %d %d %d" % (PE, m, k, n, post_scale[0], post_scale[1]))
    print("A: ", np.amax(mat_A), np.amin(mat_A), np.average(mat_A))
    print("B: ", np.amax(mat_B), np.amin(mat_B), np.average(mat_B))
    print("bias: ", np.amax(bias), np.amin(bias), np.average(bias))

    # Output buffer: (m, n) int16.
    C_fpga = np.zeros((m, n), dtype=np.int16)

    xfmlp.sendMat(mat_A, PE)
    xfmlp.sendMat(mat_B, PE)
    xfmlp.sendMat(C_fpga, PE)
    xfmlp.sendMat(bias, PE)
    # default test_basic will call addGEMMOp
    xfmlp.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1], PE)
    xfmlp.execute(PE)
    xfmlp.clearInstrBuf(PE)
    xfmlp.getMat(C_fpga, PE)

    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` GEMM instructions and verify the last one.

    Instruction 0 uses ``mat_A[0]`` and a random ``mat_B0``; every later
    instruction ``i`` uses ``mat_A[i]`` and the previous output
    ``mat_C[i - 1]`` as its right operand. All bias matrices are zero.
    Timestamps and operation counts are passed to ``test.test_perf``; the
    final output is compared through ``test.multiply_and_cmp`` unless the
    maxima of ``m_size``, ``k_size`` and ``n_size`` all exceed 4096.

    The chain was previously hard-coded to exactly four instructions; it now
    follows ``ins_count`` and behaves identically for ``ins_count == 4``.

    Args:
        ins_count: Number of chained GEMM instructions (at least 1).
        m_size: Per-instruction row counts, indexed ``0 .. ins_count - 1``.
        k_size: Per-instruction inner dimensions.
        n_size: Per-instruction column counts.
        A_range: Left operands are drawn from ``[-A_range, A_range)``.
        B_range: ``mat_B0`` is drawn from ``[-B_range, B_range)``.
        post_scale: Two-element sequence forwarded to every ``addGEMMOp``.

    Raises:
        ValueError: If ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1, got %r" % (ins_count,))

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        total_operations += 2 * m_size[i] * n_size[i] * k_size[i] + m_size[i] * n_size[i] * 3
        total_parallel_operations += 2 * m_size[i] * n_size[i] * k_size[i]
        mat_A.append(np.random.randint(low=-A_range, high=A_range,
                                       size=(m_size[i], k_size[i]), dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range,
                               size=(k_size[0], n_size[0]), dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        xfmlp.sendMat(mat_A[i])
        xfmlp.sendMat(mat_C[i])
        xfmlp.sendMat(mat_bias[i])
    xfmlp.sendMat(mat_B0)

    # Chain the instructions: each one consumes the previous output as its
    # right-hand operand; the first one consumes mat_B0.
    right_operand = mat_B0
    for i in range(ins_count):
        xfmlp.addGEMMOp(mat_A[i], right_operand, mat_C[i], mat_bias[i],
                        post_scale[0], post_scale[1])
        right_operand = mat_C[i]
    timePointKernel.append(time.time())  # send to FPGA
    xfmlp.execute()
    timePointKernel.append(time.time())  # call kernel
    for out in mat_C:
        xfmlp.getMat(out)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = xfmlp.getFreq()
    test.test_perf(timePointKernel, total_operations, total_parallel_operations, freq, 0, 0, 0)

    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Verify the final link of the chain against its actual inputs.
        last = ins_count - 1
        last_right = mat_C[last - 1] if last > 0 else mat_B0
        test.multiply_and_cmp(mat_C[last], mat_A[last], last_right, mat_bias[last],
                              m_size[last], n_size[last], post_scale)
print(m, k, n) A_buf.append(np.zeros((m, k), dtype=np.int16, order='C')) bias_buf.append(np.zeros((m, n), dtype=np.int32, order='C')) B_buf.append(np.zeros((k, n), dtype=np.int16, order='C')) C_buf.append(np.zeros((m, n), dtype=np.int16, order='C')) for i in range(num_matrix): xfmlp.sendMat(B_buf[i]) xfmlp.sendMat(A_buf[i]) xfmlp.sendMat(C_buf[i]) xfmlp.sendMat(bias_buf[i]) time.sleep(2) total_time = 0 for k in range(args.numiter): start_time = time.time() xfmlp.sendMat(B_buf[0]) for i in range(num_matrix): #xfmlp.addFCNOp(A_buf[i], B_buf[i], B_buf[i+1], bias_buf[i], 1,0,1,0 ) xfmlp.addFCNOp(A_buf[i], B_buf[i], C_buf[i], bias_buf[i], 1, 0, 1, 0) xfmlp.execute() xfmlp.getMat(C_buf[num_matrix - 1]) #xfmlp.wait() total_time += time.time() - start_time print("Average FPGA exec time(python): ", (total_time / args.numiter) * 1000, " ms") xfmlp.printStats()