def __init__(self, xclbin_opt, wgt, bias, wgt_scale, post_scale):
    """
    Quantize the layer weights/biases, pad the weights and send them via gemx.

    Parameters
    ----------
    xclbin_opt: mapping
               build options; "GEMX_gemmKBlocks", "GEMX_gemmMBlocks" and
               "GEMX_gemmNBlocks" are read here
    wgt:        array or list of arrays
               weight matrix of each layer (a single array is wrapped in a list)
    bias:       array or list of arrays
               bias of each layer (a single array is wrapped in a list)
    wgt_scale:  sequence
               per-layer factor; it multiplies BOTH the weights and the biases
               before they are cast to int16 / int32
    post_scale: object
               stored unchanged in self.post_scale
    """
    # min_m and min_n are kept at least as large as min_k so that the output
    # of one GEMM can be fed directly into the next one without ending up
    # with a dimension smaller than min_k.
    k_blocks = int(xclbin_opt["GEMX_gemmKBlocks"])
    self.min_m = 32 * max(k_blocks, int(xclbin_opt["GEMX_gemmMBlocks"]))
    self.min_k = 32 * k_blocks
    self.min_n = 32 * max(k_blocks, int(xclbin_opt["GEMX_gemmNBlocks"]))

    layer_weights = wgt if type(wgt) == list else [wgt]
    layer_biases = bias if type(bias) == list else [bias]

    # Original (unpadded) weight shapes, one per layer.
    self._wshape = [w.shape for w in layer_weights]

    # Scale, then truncate to the integer types used on the device side.
    self._qw = [np.int16(w * s) for w, s in zip(layer_weights, wgt_scale)]
    self._qb = [np.int32(b * s) for b, s in zip(layer_biases, wgt_scale)]

    # Replace every quantized weight by its padded form and upload it.
    for idx in range(len(self._qw)):
        self._qw[idx] = self.format_for_fpga(self._qw[idx], self.min_k,
                                             self.min_n)
        gemx.sendMat(self._qw[idx])

    self.fpga_buf = []
    self.out_dim = None
    self.post_scale = post_scale
    self.batch_sz = 0
def format_bias(self, b, dim, min_row, min_col):
    """
    Expand a bias to 2-D if needed, pad it and send it via gemx.

    Parameters
    ----------
    b:       array
             bias; a 1-D array is broadcast to shape `dim`
    dim:     tuple
             target shape used for the broadcast
    min_row: int
             forwarded to format_for_fpga
    min_col: int
             forwarded to format_for_fpga

    Return
    ------
    array
             the array returned by format_for_fpga (the one that was sent)
    """
    expanded = np.broadcast_to(b, dim) if b.ndim == 1 else b
    padded = self.format_for_fpga(expanded, min_row, min_col)
    gemx.sendMat(padded)
    return padded
def common_uspmv(rows, cols, datas, m_sizes, k_sizes, nnz_sizes, num_runs, vector_range):
    """
    Chain one USPMV operation per layer through gemx and check the result.

    Parameters
    ----------
    rows, cols, datas: sequences
        per-layer row indices, column indices and values of the sparse matrix
    m_sizes, k_sizes:  lists of int
        per-layer matrix dimensions; padded IN PLACE in the caller's lists
    nnz_sizes:         sequence of int
        per-layer number of non-zero entries
    num_runs:          int
        number of rows of the dense input/output matrices
    vector_range:      unused here, kept for interface compatibility

    Raises
    ------
    ValueError
        if no layer is given
    """
    if len(m_sizes) == 0:
        raise ValueError("common_uspmv needs at least one layer")

    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    min_m = ddrWidth * int(xclbin_opts["GEMX_uspmvInterleaves"])
    for i in range(len(m_sizes)):
        m_sizes[i] = test.get_padded_size(m_sizes[i], min_m)
        # NOTE(review): k is padded to min_m as well (not to ddrWidth);
        # presumably because each layer's output feeds the next layer's
        # input - confirm.
        k_sizes[i] = test.get_padded_size(k_sizes[i], min_m)
    print("size:", m_sizes, k_sizes, "nnz:", nnz_sizes)

    # B is the input of the FIRST layer (it is C_list[0] below), so its width
    # must be k_sizes[0]; the loop index left over from the padding loop
    # would select the last layer instead.
    B = np.zeros((num_runs, k_sizes[0]), dtype=np.float32)
    test.fillMod(9, num_runs, k_sizes[0], B)
    B = B.astype(np.float32)

    # C_list[i] is the input of layer i and C_list[i + 1] its output.
    C_list = [B]
    for i in range(len(m_sizes)):
        C = np.zeros((num_runs, m_sizes[i]), dtype=np.float32)
        C_list.append(C)
        A = gemx.sendUSpMat(np.array(rows[i]).astype(np.uint16),
                            np.array(cols[i]).astype(np.uint16),
                            np.array(datas[i]),
                            np.array(m_sizes[i], dtype=np.int32),
                            np.array(k_sizes[i], dtype=np.int32),
                            np.array(nnz_sizes[i], dtype=np.int32),
                            np.array(1, dtype=np.float32),
                            xclbin_opts)
        gemx.sendMat(C_list[i])
        gemx.sendMat(C_list[i + 1])
        gemx.addUSPMVOp(A, C_list[i], C_list[i + 1], num_runs)

    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_list[-1])
    test.multiply_and_cmp_uspmv(rows, cols, datas, m_sizes, k_sizes, B,
                                C_list[-1])
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """
    Run one SPMV operation through gemx and pass the result to the checker.

    The element type comes from xclbin_opts["GEMX_dataType"] ("float" or
    "int32_t"); m and k are padded with get_padded_size before use.

    Parameters
    ----------
    row, col, data: sparse matrix in coordinate form
    m, k:           int, matrix dimensions before padding
    nnz:            int, number of non-zero entries
    vector_range:   bound used when generating the dense input vector

    Raises
    ------
    TypeError
        if the configured data type is neither "float" nor "int32_t"
    """
    known_types = {"float": np.float32, "int32_t": np.int32}
    if xclbin_opts["GEMX_dataType"] not in known_types:
        raise TypeError("type", xclbin_opts["GEMX_dataType"], "not supported")
    data_type = known_types[xclbin_opts["GEMX_dataType"]]

    # Padding granularity taken from the build options.
    min_k = int(xclbin_opts["GEMX_ddrWidth"])
    min_m = int(xclbin_opts["GEMX_spmvWidth"]) * int(
        xclbin_opts["GEMX_spmvMacGroups"])
    m = get_padded_size(m, min_m)
    k = get_padded_size(k, min_k)
    print("size:", m, k, "nnz:", nnz)

    # Dense input vector: random integers, or a float vector filled by
    # test.fillMod.
    if data_type != np.int32:
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
    else:
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
    C = np.zeros((m, 1), dtype=data_type)

    A = gemx.sendSpMat(row, col, data, m, k, nnz, xclbin_opts)
    for dense in (B, C):
        gemx.sendMat(dense)
    gemx.addSPMVOp(A, B, C, nnz, xclbin_opts)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
def format_bias(self, b, dim, min_row, min_col):
    """
    Broadcast (if 1-D), transpose, pad and send a bias array via gemx.

    Parameters
    ----------
    b:       array
             bias; a 1-D array is broadcast to shape (dim[1], dim[0])
    dim:     tuple
             two-element shape; its entries are swapped for the broadcast
    min_row: int
             forwarded to format_for_fpga
    min_col: int
             forwarded to format_for_fpga

    Return
    ------
    array
             the array returned by format_for_fpga (the one that was sent)
    """
    if b.ndim == 1:
        # Broadcast with swapped dimensions; the transpose below turns the
        # result into shape (dim[0], dim[1]).
        b = np.broadcast_to(b, (dim[1], dim[0]))
    # NOTE(review): the transpose is applied to every bias here, including
    # ones that were already 2-D - confirm it is not meant for the
    # broadcast 1-D case only.
    b = np.transpose(b)
    b = self.format_for_fpga(b, min_row, min_col)
    gemx.sendMat(b)
    return b
def predict(self, inp):
    """
    Run a single USPMV operation on `inp` and return the cropped result.

    Parameters
    ----------
    inp: array
         input matrix; its row count becomes the first entry of self.out_dim

    Return
    ------
    array
         the output buffer cropped to self.out_dim
    """
    # Only the row count changes; the column count of out_dim is kept.
    self.out_dim = (inp.shape[0], self.out_dim[1])

    padded = self.format_for_fpga(inp, 1, 1)
    vec_in = padded.astype(np.float32)
    gemx.sendMat(vec_in)

    n_rows = padded.shape[0]
    vec_out = np.zeros((n_rows, self.sizes[0][0]), dtype=np.float32)
    gemx.sendMat(vec_out)

    gemx.addUSPMVOp(self.A_list[0], vec_in, vec_out, n_rows)
    gemx.execute()
    gemx.getMat(vec_out)
    gemx.clearInstrBuf()

    return vec_out[:self.out_dim[0], :self.out_dim[1]]
def test_basic(self, PE, mat_A, mat_B, bias, post_scale=[1, 1], RELU_scale=[1, 0]):
    """
    Run one FCN operation on processing element `PE` and compare the result.

    Parameters
    ----------
    PE:         processing-element index forwarded to every gemx call
    mat_A:      array, left operand
    mat_B:      array, right operand
    bias:       array, bias operand
    post_scale: two-element sequence forwarded to addFCNOp (read-only here)
    RELU_scale: two-element sequence forwarded to addFCNOp (read-only here)
    """
    n_rows = mat_A.shape[0]
    n_inner = mat_A.shape[1]
    n_cols = mat_B.shape[1]
    print("test Fcn")
    print("test_basic: %d %d %d %d %d" %
          (n_rows, n_inner, n_cols, post_scale[0], post_scale[1]))
    # Max / min / mean of every operand, for the log.
    for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
        print(label, np.amax(operand), np.amin(operand), np.average(operand))

    C_fpga = np.zeros((n_rows, n_cols), dtype=np.int16, order='C')
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand, PE)
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1],
                  RELU_scale[0], RELU_scale[1], PE)
    gemx.execute(PE)
    gemx.clearInstrBuf(PE)
    gemx.getMat(C_fpga, PE)
    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, n_rows, n_cols,
                          post_scale, RELU_scale)
def test_perf_fcn(A_range, B_range, bias_range, m, k, n, post_scale):
    """
    Time one FCN operation on random int16 operands and report performance.

    Parameters
    ----------
    A_range, B_range: bounds of the random values in A (m x k) and B (k x n)
    bias_range:       bound of the random bias values; 0 selects a zero bias
    m, k, n:          int, matrix dimensions
    post_scale:       two-element sequence forwarded to addFCNOp
    """
    mat_A = np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range,
                              high=B_range,
                              size=(k, n),
                              dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range,
                                 high=bias_range,
                                 size=(m, n),
                                 dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1],
                  1, 0)
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    stamps.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, freq, m, k, n)

    # The comparison is skipped only when all three dimensions exceed 4096.
    if m <= 4096 or n <= 4096 or k <= 4096:
        test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
    else:
        print("Skip golden comparision because large matrix size")
def test_perf_gemm(m, k, n, xclbin_opts, post_scale=[1,0], A_range=32764, B_range=32764, bias_range=32764):
    """
    Time one GEMM operation end to end on padded random int16 operands.

    Parameters
    ----------
    m, k, n:      int, requested dimensions; each is padded with
                  test.get_padded_size to a multiple of (blocks * ddrWidth)
    xclbin_opts:  mapping with "GEMX_ddrWidth" and "GEMX_gemm{M,K,N}Blocks"
    post_scale:   two-element sequence forwarded to addGEMMOp (read-only here)
    A_range, B_range: bounds of the random values in A and B
    bias_range:   bound of the random bias values; 0 selects a zero bias
    """
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    m = test.get_padded_size(
        m, int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
    k = test.get_padded_size(
        k, int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
    n = test.get_padded_size(
        n, int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)

    mat_A = np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range,
                              high=B_range,
                              size=(k, n),
                              dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range,
                                 high=bias_range,
                                 size=(m, n),
                                 dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    # The measured interval covers upload, execution and download.
    t_begin = time.time()
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    t_end = time.time()

    op_count = 2 * m * n * k + m * n * 3
    test.test_perf(t_end - t_begin, op_count, m, k, n, ddrWidth)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_basic(self, PE, xclbin_opts, mat_A, mat_B, bias, post_scale=[1, 0]):
    """
    Run one GEMM operation on processing element `PE` and compare the result.

    Parameters
    ----------
    PE:          processing-element index forwarded to every gemx call
    xclbin_opts: mapping; "GEMX_dataType" selects the output element type
                 ("short" -> int16, anything else -> float32)
    mat_A:       array, left operand
    mat_B:       array, right operand
    bias:        array, bias operand
    post_scale:  two-element sequence forwarded to addGEMMOp (read-only here)
    """
    n_rows = mat_A.shape[0]
    n_inner = mat_A.shape[1]
    n_cols = mat_B.shape[1]
    print("test_basic(PE=%d): %d %d %d %d %d" %
          (PE, n_rows, n_inner, n_cols, post_scale[0], post_scale[1]))
    # Max / min / mean of every operand, for the log.
    for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
        print(label, np.amax(operand), np.amin(operand), np.average(operand))

    # "short" builds produce int16 output; every other build is treated as
    # float.
    out_type = np.int16 if xclbin_opts["GEMX_dataType"] == "short" else np.float32
    C_fpga = np.zeros((n_rows, n_cols), dtype=out_type, order='C')

    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand, PE)
    # default test_basic will call addGEMMOp
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1],
                   PE)
    gemx.execute(PE)
    gemx.clearInstrBuf(PE)
    gemx.getMat(C_fpga, PE)
    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, n_rows, n_cols,
                          post_scale)
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """
    Run one SPMV operation through gemx and pass the result to the checker.

    The element type comes from xclbin_opts["GEMX_dataType"] ("float" or
    "int32_t").

    Parameters
    ----------
    row, col, data: sparse matrix in coordinate form
    m, k:           int, matrix dimensions
    nnz:            int, number of non-zero entries
    vector_range:   bound used when generating the dense input vector

    Raises
    ------
    TypeError
        if the configured data type is neither "float" nor "int32_t"
    """
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])

    type_name = xclbin_opts["GEMX_dataType"]
    if type_name == "float":
        dtype = np.float32
    elif type_name == "int32_t":
        dtype = np.int32
    else:
        raise TypeError("type", xclbin_opts["GEMX_dataType"], "not supported")

    # Only the construction of the dense vectors depends on the type; the
    # gemx call sequence is shared by both cases.
    if dtype == np.int32:
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
        C = np.zeros((m, 1), dtype=np.int32)
    else:
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
        C = np.zeros((m, 1), dtype=np.float32)

    A = gemx.sendSpMat(row, col, data, ddrWidth, dtype)
    gemx.sendMat(B)
    gemx.sendMat(C)
    gemx.addSPMVOp(A, B, C, nnz)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
def common_spmv(row, col, data, m, k, nnz, vector_range, dtype):
    """
    Run one SPMV operation through gemx and pass the result to the checker.

    Parameters
    ----------
    row, col, data: sparse matrix in coordinate form
    m, k:           int, matrix dimensions
    nnz:            int, number of non-zero entries
    vector_range:   bound used when generating the dense input vector
    dtype:          np.int32 or np.float32

    Raises
    ------
    TypeError
        if dtype is neither np.int32 nor np.float32
    """
    # Only the construction of the dense vectors depends on the type; the
    # gemx call sequence is shared by both cases.
    if dtype == np.int32:
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
        C = np.zeros((m, 1), dtype=np.int32)
    elif dtype == np.float32:
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
        C = np.zeros((m, 1), dtype=np.float32)
    else:
        raise TypeError("type", dtype, "not supported")

    A = gemx.sendSpMat(row, col, data, nnz, dtype)
    gemx.sendMat(B)
    gemx.sendMat(C)
    gemx.addSPMVOp(A, B, C, nnz)
    gemx.execute()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
def predict(self, inp, in_scale):
    """
    Scale and pad the input, run the loaded instructions, return the result.

    Parameters
    ----------
    inp:      array
              input matrix
    in_scale: float
              factor applied to the input before it is cast to int16

    Return
    ------
    array
              last buffer of self.fpga_buf cropped to self.out_dim
    """
    self.init_fpgabuf(inp.shape)
    self.loadInstr()

    padded = self.format_for_fpga(inp * in_scale, self.min_m, self.min_k)
    # Write into the existing first buffer instead of replacing it.
    first_buf = self.fpga_buf[0]
    np.copyto(first_buf, np.int16(padded), casting='same_kind', where=True)
    gemx.sendMat(first_buf)

    gemx.execute()
    last_buf = self.fpga_buf[-1]
    gemx.getMat(last_buf)

    n_rows, n_cols = self.out_dim[0], self.out_dim[1]
    return last_buf[:n_rows, :n_cols]
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """
    Time a chain of `ins_count` GEMM operations and report performance.

    Instruction 0 computes A[0] * B0; every later instruction i computes
    A[i] * C[i - 1], i.e. it consumes the previous output. The chain length
    follows `ins_count` (the operation counts and the buffers were already
    built for `ins_count` entries); for ins_count == 4 the issued
    instructions are exactly the four that used to be hard-coded.

    Parameters
    ----------
    ins_count:              int, number of chained instructions (>= 1)
    m_size, k_size, n_size: sequences with one dimension per instruction
    A_range, B_range:       bounds of the random values in A[i] and B0
    post_scale:             two-element sequence forwarded to addGEMMOp

    Raises
    ------
    ValueError
        if ins_count is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        parallel = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += parallel
        total_operations += parallel + m_size[i] * n_size[i] * 3
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Right-hand operand of instruction i: B0 first, then the previous output.
    rhs = [mat_B0] + mat_C[:-1]
    for i in range(ins_count):
        gemx.addGEMMOp(mat_A[i], rhs[i], mat_C[i], mat_bias[i],
                       post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    for out in mat_C:
        gemx.getMat(out)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)

    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(
            n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Only the last link of the chain is compared.
        last = ins_count - 1
        test.multiply_and_cmp(mat_C[last], mat_A[last], rhs[last],
                              mat_bias[last], m_size[last], n_size[last],
                              post_scale)
def predict(self, inp, in_scale, post_scale):
    """
    Pad and scale the input, queue one operation per layer, run them.

    Parameters
    ----------
    inp:        array
                input matrix
    in_scale:   float
                factor applied to the padded input before the int16 cast
    post_scale: sequence
                post_scale[i][0] and post_scale[i][1] are forwarded to the
                operation queued for layer i

    Return
    ------
    array
                last buffer of self.fpga_buf cropped to self.out_dim
    """
    padded_rows, padded_cols = self.get_padded_shape(inp.shape, self.min_m,
                                                     self.min_k)
    # Zero-padded copy of the input, original data in the top-left corner.
    staging = np.zeros((padded_rows, padded_cols), dtype=inp.dtype, order='C')
    staging[:inp.shape[0], :inp.shape[1]] = inp
    print("input shape", staging.shape)

    np.copyto(self.fpga_buf[0],
              np.int16(staging * in_scale),
              casting='same_kind',
              where=True)
    gemx.sendMat(self.fpga_buf[0])

    # Layer i reads fpga_buf[i] and writes fpga_buf[i + 1]; layers with a
    # 'relu' activation use addFCNOp, every other layer uses addGEMMOp.
    for idx, layer in enumerate(self.kmodel.layers):
        activation = layer.get_config()['activation']
        if activation != 'relu':
            gemx.addGEMMOp(self.fpga_buf[idx], self.w[idx],
                           self.fpga_buf[idx + 1], self.b[idx],
                           post_scale[idx][0], post_scale[idx][1])
        else:
            gemx.addFCNOp(self.fpga_buf[idx], self.w[idx],
                          self.fpga_buf[idx + 1], self.b[idx],
                          post_scale[idx][0], post_scale[idx][1], 0, 0)

    gemx.execute()
    gemx.getMat(self.fpga_buf[-1])
    return self.fpga_buf[-1][:self.out_dim[0], :self.out_dim[1]]
def test_multi_fcn(ins_count, m_size, k_size, n_size, post_scale=[1, 0], A_range=32764, B_range=32764):
    """
    Run a chain of `ins_count` FCN operations and compare the last result.

    Instruction 0 computes on A[0] and B0; every later instruction i uses
    A[i] and the previous output C[i - 1]. The chain length follows
    `ins_count` (the buffers were already built for `ins_count` entries);
    for ins_count == 4 the issued instructions are exactly the four that
    used to be hard-coded.

    Parameters
    ----------
    ins_count:              int, number of chained instructions (>= 1)
    m_size, k_size, n_size: lists with one dimension per instruction;
                            padded IN PLACE in the caller's lists
    post_scale:             two-element sequence forwarded to addFCNOp
                            (read-only here)
    A_range, B_range:       bounds of the random values in A[i] and B0

    Raises
    ------
    ValueError
        if ins_count is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    mat_A = []
    mat_C = []
    mat_bias = []
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    for i in range(ins_count):
        m_size[i] = test.get_padded_size(
            m_size[i], int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
        k_size[i] = test.get_padded_size(
            k_size[i], int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
        n_size[i] = test.get_padded_size(
            n_size[i], int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)

    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Right-hand operand of instruction i: B0 first, then the previous output.
    rhs = [mat_B0] + mat_C[:-1]
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], rhs[i], mat_C[i], mat_bias[i], post_scale[0],
                      post_scale[1], 1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    for out in mat_C:
        gemx.getMat(out)

    # Only the last link of the chain is compared.
    last = ins_count - 1
    test.multiply_and_cmp(mat_C[last], mat_A[last], rhs[last],
                          mat_bias[last], m_size[last], n_size[last],
                          post_scale)
def __init__(self, xclbin_opts, wgt, bias, wgt_scale, bias_scale, post_scale, relu_scale):
    """
    Quantize (unless the build is float), transpose, pad and send the weights.

    Parameters
    ----------
    xclbin_opts: mapping
                build options; "GEMX_ddrWidth", "GEMX_gemmKBlocks",
                "GEMX_gemmMBlocks", "GEMX_gemmNBlocks" and "GEMX_dataType"
                are read here
    wgt:         array or list of arrays
                weight matrix of each layer (a single array is wrapped)
    bias:        array or list of arrays
                bias of each layer (a single array is wrapped)
    wgt_scale:   sequence, one factor per weight matrix
    bias_scale:  sequence, one factor per bias
    post_scale:  object, stored unchanged in self.post_scale
    relu_scale:  object, stored unchanged in self.relu_scale

    Raises
    ------
    AssertionError
        if the number of scales does not match the number of weights/biases
    """
    # min_m is kept at least as large as min_k: when GEMM operations are
    # chained, the output of one call is used directly as input of the next,
    # and a dimension smaller than min_k would lead to bad results.
    # min_n is derived from GEMX_gemmNBlocks only.
    ddrwidth = int(xclbin_opts["GEMX_ddrWidth"])
    self.min_m = ddrwidth * max(int(xclbin_opts["GEMX_gemmKBlocks"]),
                                int(xclbin_opts["GEMX_gemmMBlocks"]))
    self.min_k = ddrwidth * int(xclbin_opts["GEMX_gemmKBlocks"])
    self.min_n = ddrwidth * int(xclbin_opts["GEMX_gemmNBlocks"])

    if type(wgt) != list:
        wgt = [wgt]
    if type(bias) != list:
        bias = [bias]

    assert len(wgt) == len(wgt_scale)
    assert len(bias) == len(bias_scale)

    # Original (unpadded, untransposed) weight shapes, one per layer.
    self._wshape = [w.shape for w in wgt]

    if xclbin_opts["GEMX_dataType"] == "float":
        # Copy the lists: self._qw is overwritten element by element below,
        # which must not modify the list object owned by the caller.
        self._qw = list(wgt)
        self._qb = list(bias)
    else:
        # Scale, round to nearest, then cast to the integer device types.
        self._qw = [
            np.int16(np.around(a * b)) for a, b in zip(wgt, wgt_scale)
        ]
        self._qb = [
            np.int32(np.around(a * b)) for a, b in zip(bias, bias_scale)
        ]

    # Every weight is stored transposed and padded, then sent via gemx.
    for i, b in enumerate(self._qw):
        b = np.transpose(b)
        self._qw[i] = self.format_for_fpga(b, self.min_m, self.min_k)
        gemx.sendMat(self._qw[i])

    self.fpga_buf = []
    self.out_dim = None
    self.post_scale = post_scale
    self.relu_scale = relu_scale
    self.batch_sz = 0
def predict(self, inp, in_scale):
    """
    Transpose, scale and pad the input, run the loaded instructions and
    return the transposed, cropped content of the last buffer.

    Parameters
    ----------
    inp:      array
              input matrix
    in_scale: float
              input scale

    Return
    ------
    array
              result prediction matrix
    """
    transposed = np.transpose(inp)
    self.init_fpgabuf(transposed.shape)
    self.loadInstr()

    padded = self.format_for_fpga(transposed * in_scale, self.min_k,
                                  self.min_n)
    # An int16 input buffer receives an explicit int16 cast of the padded
    # data; any other buffer type receives the padded data as it is.
    first_buf = self.fpga_buf[0]
    payload = np.int16(padded) if first_buf.dtype == np.int16 else padded
    np.copyto(first_buf, payload, casting='same_kind', where=True)
    gemx.sendMat(first_buf)

    gemx.execute()
    last_buf = self.fpga_buf[-1]
    gemx.getMat(last_buf)

    cropped = last_buf[:self.out_dim[0], :self.out_dim[1]]
    return np.transpose(cropped)
def test_textfiles(self, path_to_a, path_to_b, path_to_bias, post_scale):
    """
    Load A, B and the bias from text files, run one FCN operation and
    compare the result.

    Parameters
    ----------
    path_to_a:    path of the text file holding A (read as int16)
    path_to_b:    path of the text file holding B (read as int16)
    path_to_bias: path of the text file holding the bias (read as int32)
    post_scale:   two-element sequence forwarded to addFCNOp
    """
    mat_A = np.loadtxt(path_to_a, dtype=np.int16)
    mat_B = np.loadtxt(path_to_b, dtype=np.int16)
    bias = np.loadtxt(path_to_bias, dtype=np.int32)

    n_rows = mat_A.shape[0]
    n_cols = mat_B.shape[1]
    C_fpga = np.zeros((n_rows, n_cols), dtype=np.int16, order='C')

    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1],
                  1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, n_rows, n_cols,
                          post_scale)
def predict(self, inp, in_scale):
    """
    Run the layers one input column at a time with SPMV operations and add
    the bias of each layer on the host.

    Parameters
    ----------
    inp:      array
              input matrix
    in_scale: float
              factor applied to the padded input

    Return
    ------
    array
              transposed output of the last layer cropped to self.out_dim
    """
    # C_list[i] is the input of layer i, C_list[i + 1] its output; slot 0 is
    # filled below and the other slots are replaced as the layers run.
    C_list = [[]] * (len(self.kmodel.layers) + 1)
    inp = self.format_for_fpga(inp, self.min_m, self.min_m)
    C_list[0] = np.transpose(inp * in_scale)
    for i, bi in enumerate(self._qb):
        # Turn the bias into a column and pad it.
        # NOTE(review): the padding arguments are the column count of the
        # layer input and self.sizes[i][0]; confirm that the padded shape
        # broadcasts against the layer output in the addition below.
        bi = bi.reshape(bi.shape[0], 1)
        bi = self.format_for_fpga(bi, C_list[i].shape[1], self.sizes[i][0])
        # One SPMV per column of the layer input.
        for j in range(C_list[i].shape[1]):
            B = (C_list[i][:, j]).astype(np.float32)
            C = np.zeros((self.sizes[i][0], 1), dtype=np.float32)
            gemx.sendMat(B)
            gemx.sendMat(C)
            gemx.addSPMVOp(self.A_list[i], B, C, self.sizes[i][2])
            gemx.execute()
            gemx.clearInstrBuf()
            gemx.getMat(C)
            # The first column starts the output, later ones are appended.
            if j == 0:
                C_list[i + 1] = C
            else:
                C_list[i + 1] = np.append(C_list[i + 1], C, axis=1)
        # NOTE(review): the bias is added once per layer, after all columns
        # have been collected - confirm this placement.
        C_list[i + 1] = C_list[i + 1] + np.transpose(bi)
    result = np.transpose(C_list[-1])
    return result[:self.out_dim[0], :self.out_dim[1]]
def test_basic(self,PE, mat_A, mat_B, bias, post_scale = [1,1]):
    """
    Run one GEMM operation on processing element `PE` and compare the result.

    Parameters
    ----------
    PE:         processing-element index forwarded to every gemx call
    mat_A:      array, left operand
    mat_B:      array, right operand
    bias:       array, bias operand
    post_scale: two-element sequence forwarded to addGEMMOp (read-only here)
    """
    n_rows = mat_A.shape[0]
    n_inner = mat_A.shape[1]
    n_cols = mat_B.shape[1]
    print("test_basic(PE=%d): %d %d %d %d %d" %
          (PE, n_rows, n_inner, n_cols, post_scale[0], post_scale[1]))
    # Max / min / mean of every operand, for the log.
    for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
        print(label, np.amax(operand), np.amin(operand), np.average(operand))

    C_fpga = np.zeros((n_rows, n_cols), dtype=np.int16)
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand, PE)
    # default test_basic will call addGEMMOp
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1],
                   PE)
    gemx.execute(PE)
    gemx.getMat(C_fpga, PE)
    self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, n_rows, n_cols,
                          post_scale)
def test_perf_gemm(m, k, n, A_range=32764, B_range=32764, bias_range=32764, post_scale=[1, 0]):
    """
    Time one GEMM operation on random int16 operands and report performance.

    Parameters
    ----------
    m, k, n:          int, matrix dimensions (A is m x k, B is k x n)
    A_range, B_range: bounds of the random values in A and B
    bias_range:       bound of the random bias values; 0 selects a zero bias
    post_scale:       two-element sequence forwarded to addGEMMOp
                      (read-only here)
    """
    mat_A = np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range,
                              high=B_range,
                              size=(k, n),
                              dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range,
                                 high=bias_range,
                                 size=(m, n),
                                 dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    stamps.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, freq, m, k, n)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def predict(self, inp):
    """
    prepare input matrix for the engine, send all the matrices and
    instructions to kernel and get the result prediction matrix

    The SPMV instructions for all layers are queued once, for the first
    input sample; every further sample reuses the same buffers and
    instructions: the input buffer is overwritten, the buffers are re-sent
    and the queued instructions are executed again.

    Parameters
    ----------
    inp:       array
               input matrix

    Return
    ------
    array
               result prediction matrix
    """
    inp = self.format_for_fpga(inp, 1, 1)
    samples = np.transpose(inp)  # one sample per column

    # C_vector[0] is the input vector, C_vector[i + 1] the output of layer i.
    C_vector = [(samples[:, 0]).astype(np.float32)]
    gemx.sendMat(C_vector[0])
    for i, l in enumerate(self.kmodel.layers):
        C_vector.append(np.zeros((self.sizes[i][0], 1), dtype=np.float32))
        gemx.sendMat(C_vector[i + 1])
        activation = l.get_config()['activation'] == 'relu'
        gemx.addSPMVOp(self.A_list[i], C_vector[i], C_vector[i + 1],
                       self.sizes[i][2], self.xclbin_opts, activation)
    gemx.execute()
    gemx.getMat(C_vector[-1])

    # Keep a COPY of every result: C_vector[-1] is zeroed and refilled for
    # each following sample, so storing the buffer itself would let the
    # first sample's result be overwritten by the second one.
    outputs = [C_vector[-1].copy()]
    for j in range(1, samples.shape[1]):
        C_vector[0][:] = (samples[:, j]).astype(np.float32)
        gemx.sendMat(C_vector[0])
        C_vector[-1].fill(0)
        for i in range(len(self.kmodel.layers)):
            gemx.sendMat(C_vector[i + 1])
        gemx.execute()
        gemx.getMat(C_vector[-1])
        outputs.append(C_vector[-1].copy())
    gemx.clearInstrBuf()

    # One column per sample, joined once instead of growing an array.
    result = np.transpose(np.concatenate(outputs, axis=1))
    return result[:self.out_dim[0], :self.out_dim[1]]
def predict(self, inp):
    """
    prepare input matrix for the engine, send all the matrices and
    instructions to kernel and get the result prediction matrix

    With GEMX_uspmvStages == 1 one USPMV instruction is queued per layer,
    each writing an intermediate buffer; otherwise a single instruction is
    queued that reads the input and writes the final output buffer.

    Parameters
    ----------
    inp:       array
               input matrix

    Return
    ------
    array
               result prediction matrix
    """
    stage_size = int(self.xclbin_opts["GEMX_uspmvStages"])
    layer_size = len(self._qw)

    # The input is prepared the same way in both modes.
    inp = self.format_for_fpga(inp, 1, self.min_m)
    C_list = [inp.astype(np.float32)]
    gemx.sendMat(C_list[0])

    if stage_size == 1:
        # C_list[i] is the input of layer i and C_list[i + 1] its output.
        for i in range(layer_size):
            C_list.append(
                np.zeros((inp.shape[0], self.sizes[i][0]), dtype=np.float32))
            gemx.sendMat(C_list[i + 1])
            gemx.addUSPMVOp(self.A_list[i], C_list[i], C_list[i + 1],
                            inp.shape[0])
    else:
        C_end = np.zeros((inp.shape[0], self.sizes[-1][0]),
                         dtype=np.float32)
        gemx.sendMat(C_end)
        # The output buffer has to be registered in C_list; otherwise
        # C_list[-1] is still the input buffer, which would then be used as
        # the operation's output and returned as the result.
        C_list.append(C_end)
        gemx.addUSPMVOp(self.A_list[0], C_list[0], C_list[-1], inp.shape[0])

    gemx.execute()
    gemx.getMat(C_list[-1])
    gemx.clearInstrBuf()
    result = C_list[-1]
    return result[:self.out_dim[0], :self.out_dim[1]]
def test_perf_multi_fcn(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """
    Time a chain of `ins_count` FCN operations and report performance.

    Instruction 0 computes on A[0] and B0; every later instruction i uses
    A[i] and the previous output C[i - 1]. The chain length follows
    `ins_count` (the operation counts and the buffers were already built for
    `ins_count` entries); for ins_count == 4 the issued instructions are
    exactly the four that used to be hard-coded.

    Parameters
    ----------
    ins_count:              int, number of chained instructions (>= 1)
    m_size, k_size, n_size: sequences with one dimension per instruction
    A_range, B_range:       bounds of the random values in A[i] and B0
    post_scale:             two-element sequence forwarded to addFCNOp

    Raises
    ------
    ValueError
        if ins_count is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        parallel = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += parallel
        total_operations += parallel + m_size[i] * n_size[i] * 3
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Right-hand operand of instruction i: B0 first, then the previous output.
    rhs = [mat_B0] + mat_C[:-1]
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], rhs[i], mat_C[i], mat_bias[i], post_scale[0],
                      post_scale[1], 1, 0)
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    for out in mat_C:
        gemx.getMat(out)
    timePointKernel.append(time.time())  # copy from FPGA

    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)

    # Only the last link of the chain is compared.
    last = ins_count - 1
    test.multiply_and_cmp(mat_C[last], mat_A[last], rhs[last],
                          mat_bias[last], m_size[last], n_size[last],
                          post_scale)
def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """
    Queue two dependent GEMM operations (C = A*B, then E = D*C), run them in
    one execute call and compare both results.

    Parameters
    ----------
    int_range: bound of all randomly generated values
    m, k, n:   int, matrix dimensions (A and D are m x k, B is k x n)
    add_bias:  random biases are used only when this compares equal to True;
               otherwise both biases are zero
    """
    print("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))

    def random_matrix(shape, dtype):
        # Uniform integers in [-int_range, int_range).
        return np.random.randint(low=-int_range,
                                 high=int_range,
                                 size=shape,
                                 dtype=dtype)

    A = random_matrix((m, k), np.int16)
    B = random_matrix((k, n), np.int16)
    C = np.zeros((m, n), dtype=np.int16)
    D = random_matrix((m, k), np.int16)
    E = np.zeros((m, n), dtype=np.int16)

    # The comparison with True is kept on purpose (see add_bias above).
    if add_bias == True:
        b0 = random_matrix((m, n), np.int32)
        b1 = random_matrix((m, n), np.int32)
    else:
        b0 = np.zeros((m, n), dtype=np.int32)
        b1 = np.zeros((m, n), dtype=np.int32)

    for operand in (A, B, b0, C, D, E, b1):
        gemx.sendMat(operand)
    gemx.addGEMMOp(A, B, C, b0, 1, 0)
    # The second operation consumes the output of the first one.
    gemx.addGEMMOp(D, C, E, b1, 1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    gemx.getMat(E)

    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, [1, 0])
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, [1, 0])
int(xclbin_opt["GEMX_gemmKBlocks"]), int( xclbin_opt["GEMX_gemmNBlocks"])) if args.mtx == 'none': num_matrix = len(args.matrix) / 3 for i in range(num_matrix): m = args.matrix[i * 3] k = args.matrix[i * 3 + 1] n = args.matrix[i * 3 + 2] m = int(math.ceil(np.float32(m) / min_m) * min_m) k = int(math.ceil(np.float32(k) / min_k) * min_k) n = int(math.ceil(np.float32(n) / min_n) * min_n) print(m, k, n) A = np.zeros((m, k), dtype=np.int16) A.fill(1) A_buf.append(A) gemx.sendMat(A_buf[i]) B_buf.append(np.zeros((k, n), dtype=np.int16)) C_buf.append(np.zeros((m, n), dtype=np.int16)) bias = np.zeros((m, n), dtype=np.int32) bias.fill(1) bias_buf.append(bias) else: #For fcn if read from mtx files, matrix sizes still need to be provided num_matrix = len(args.mtx) if num_matrix != len(args.matrix) / 3: raise Exception("please enter sizes for each layer") for i in range(num_matrix): matA = sio.mmread(args.mtx[i]) m = args.matrix[i * 3] k = args.matrix[i * 3 + 1] n = args.matrix[i * 3 + 2]