# NOTE(review): this span used to be one collapsed physical line, so everything
# after "# copy from FPGA" was parsed as comment text and never executed.
# Line structure is restored below without changing any statement.
# The statements before the entry-point guard continue a routine whose header
# is not visible in this span -- re-indent them under that header if needed.
timePointKernel.append(time.time())  # copy from FPGA
freq = gemx.getFreq()
test.test_perf(timePointKernel, total_operations, total_parallel_operations,
               freq, 0, 0, 0)
# The golden comparison is skipped only when the maximum of each of the three
# dimension arrays exceeds 4096.
if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(n_size) > 4096:
    print("Skip golden comparision because large matrix size")
else:
    test.multiply_and_cmp(mat_C[3], mat_A[3], mat_C[2], mat_bias[3],
                          m_size[3], n_size[3], post_scale)


# Script entry point: seed NumPy, build the test object and the GEMM handle
# from the command-line arguments, then run one four-entry multi-GEMM
# performance measurement and print the statistics.
# NOTE(review): gemx.printStats() is assumed to belong inside the guard; the
# collapsed line does not show its indentation -- confirm.
if __name__ == '__main__':
    np.random.seed(123)  # for reproducibility
    test = GemmTest()
    parser = gemx.processCommandLine()
    args = parser.parse_args()

    gemx.createGEMMHandle(args.xclbin, args.gemxlib, args.device,
                          args.numKernel)
    m_size = np.array([512, 512, 2048, 128])
    k_size = np.array([384, 512, 512, 2048])
    n_size = np.array([128, 128, 128, 128])
    # run performance measurement
    test_perf_multi_gemm(4, m_size, k_size, n_size, 32764, 32764, [1, 0])
    gemx.printStats()

    # Disabled size sweep, kept as found in the original comment text:
    # size = 256
    # while size < 16384:
    #     test_perf(32764, 32764, 0, size, size, size, [1,0])
    #     size = size * 2
# NOTE(review): the physical line below is whitespace-collapsed source. It is
# left untouched because its block nesting cannot be recovered from the text.
#
# What the line visibly contains, in order:
#   1. The tail of a routine whose header is not visible here: one
#      gemx.addFCNOp(...) call, gemx.execute(), gemx.clearInstrBuf(),
#      gemx.getMat() on mat_C[0]..mat_C[3], then test.multiply_and_cmp(...).
#   2. An `if __name__ == '__main__':` driver that seeds NumPy with 123,
#      creates FcnTest(), reads (args, xclbin_opts) from
#      gemx.processCommandLine() and calls gemx.createFCNHandle(args, xclbin_opts).
#   3. Under `xclbin_opts["GEMX_dataType"] == "short"`, nested loops over
#      j in range(1, 3), k in range(1, 8), i in range(GEMX_numKernels) and
#      (m, n) in ([0, 0], [1, 0]) that call test.test_basic_randint(...).
#   4. test.test_basic_size(512, 512, 512, xclbin_opts), then a loop calling
#      test_perf_fcn(size, size, size, xclbin_opts) starting at size = 256
#      while size < 8192.
#
# TODO confirm: whether items in (4) sit inside the "short" branch or the
# loops -- the collapsed text does not show it.
# TODO confirm: no update of `size` is visible inside the final while loop;
# presumably the text is cut off before it. As written the loop would not end.
# As written, everything after the first '#' on the line is comment text.
gemx.addFCNOp(mat_A[3], mat_C[2], mat_C[3], mat_bias[3], post_scale[0], post_scale[1], 1, 0) gemx.execute() gemx.clearInstrBuf() gemx.getMat(mat_C[0]) gemx.getMat(mat_C[1]) gemx.getMat(mat_C[2]) gemx.getMat(mat_C[3]) test.multiply_and_cmp(mat_C[3], mat_A[3], mat_C[2], mat_bias[3], m_size[3], n_size[3], post_scale) if __name__ == '__main__': np.random.seed(123) # for reproducibility test = FcnTest() args, xclbin_opts = gemx.processCommandLine() gemx.createFCNHandle(args, xclbin_opts) if xclbin_opts["GEMX_dataType"] == "short": for j in range(1, 3): for k in range(1, 8): for i in range(int(xclbin_opts["GEMX_numKernels"])): for m, n in ([0, 0], [1, 0]): test.test_basic_randint(i, xclbin_opts, [j, k], [m, n], 2048) test.test_basic_size(512, 512, 512, xclbin_opts) size = 256 while size < 8192: test_perf_fcn(size, size, size, xclbin_opts) # run performance measurement