Exemplo n.º 1
0
def test_perf_gemm(m, k, n, xclbin_opts, post_scale=[1,0], A_range=32764, B_range=32764, bias_range=32764):
    """Run one timed GEMM instruction through gemx and verify the result.

    Args:
        m, k, n: requested matrix dimensions. Each is passed through
            test.get_padded_size together with the matching
            GEMX_gemm{M,K,N}Blocks option multiplied by GEMX_ddrWidth.
        xclbin_opts: mapping that provides "GEMX_ddrWidth",
            "GEMX_gemmMBlocks", "GEMX_gemmKBlocks" and "GEMX_gemmNBlocks".
        post_scale: two-element sequence; items 0 and 1 are forwarded to
            gemx.addGEMMOp and the whole sequence to test.multiply_and_cmp.
        A_range, B_range: bounds for the random int16 contents of A and B
            (drawn from [-range, range)).
        bias_range: bound for the random int32 bias; 0 selects an all-zero
            bias instead.
    """
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])

    def padded(dim, blocks_key):
        # Block count from the xclbin options times the DDR width.
        return test.get_padded_size(dim, int(xclbin_opts[blocks_key]) * ddrWidth)

    m = padded(m, "GEMX_gemmMBlocks")
    k = padded(k, "GEMX_gemmKBlocks")
    n = padded(n, "GEMX_gemmNBlocks")

    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    # The timed region covers upload, execution and read-back.
    start_time = time.time()
    for mat in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(mat)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    end_time = time.time()

    multiply_add_ops = 2 * m * n * k
    per_element_ops = m * n * 3
    total_operations = multiply_add_ops + per_element_ops
    test.test_perf(end_time - start_time, total_operations, m, k, n, ddrWidth)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 2
0
 def test_basic(self, PE, xclbin_opts, mat_A, mat_B, bias, post_scale=[1, 0]):
     """Run a single GEMM instruction on processing element PE and verify it.

     Prints the problem size and the max/min/average of each input, uploads
     the operands, queues one gemx.addGEMMOp, executes it, reads the result
     back and hands everything to self.multiply_and_cmp.

     Args:
         PE: index passed to every gemx call.
         xclbin_opts: mapping; "GEMX_dataType" == "short" selects an int16
             result buffer, any other value selects float32.
         mat_A, mat_B, bias: operand arrays (indexed as 2-D via .shape).
         post_scale: two-element sequence forwarded to gemx.addGEMMOp.
     """
     m, k = mat_A.shape[0], mat_A.shape[1]
     n = mat_B.shape[1]
     print("test_basic(PE=%d): %d %d %d %d %d" %
           (PE, m, k, n, post_scale[0], post_scale[1]))
     for label, mat in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print(label, np.amax(mat), np.amin(mat), np.average(mat))

     # Result dtype follows the data type the xclbin was built for.
     if xclbin_opts["GEMX_dataType"] == "short":
         result_dtype = np.int16
     else:  # float
         result_dtype = np.float32
     C_fpga = np.zeros((m, n), dtype=result_dtype, order='C')

     for mat in (mat_A, mat_B, C_fpga, bias):
         gemx.sendMat(mat, PE)
     # default test_basic will call addGEMMOp
     gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias,
                    post_scale[0], post_scale[1], PE)
     gemx.execute(PE)
     gemx.clearInstrBuf(PE)
     gemx.getMat(C_fpga, PE)
     self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 3
0
    def loadInstr(self):
        """Clear the gemx instruction buffer and queue one op per model layer.

        Layer i reads self.fpga_buf[i], self._qw[i] and self._qb[i] and writes
        self.fpga_buf[i+1], scaled by self.post_scale[i]. Layers whose
        'activation' config entry is 'relu' are queued with gemx.addFCNOp
        (with two trailing zero arguments); all others use gemx.addGEMMOp.
        """
        gemx.clearInstrBuf()

        for idx, layer in enumerate(self.kmodel.layers):
            uses_relu = layer.get_config()['activation'] == 'relu'
            # Operands shared by both instruction kinds.
            op_args = (self.fpga_buf[idx], self._qw[idx], self.fpga_buf[idx+1],
                       self._qb[idx],
                       self.post_scale[idx][0], self.post_scale[idx][1])
            if uses_relu:
                gemx.addFCNOp(*(op_args + (0, 0)))
            else:
                gemx.addGEMMOp(*op_args)
Exemplo n.º 4
0
 def test_textfiles(self, path_to_a, path_to_b, path_to_bias, post_scale):
   """Run one GEMM instruction on operands loaded from text files.

   Args:
       path_to_a, path_to_b: files read with np.loadtxt as int16.
       path_to_bias: file read with np.loadtxt as int32.
       post_scale: two-element sequence forwarded to gemx.addGEMMOp and
           to self.multiply_and_cmp.
   """
   mat_A = np.loadtxt(path_to_a, dtype=np.int16)
   mat_B = np.loadtxt(path_to_b, dtype=np.int16)
   bias = np.loadtxt(path_to_bias, dtype=np.int32)

   # The result has as many rows as A and as many columns as B.
   m, n = mat_A.shape[0], mat_B.shape[1]
   C_fpga = np.zeros((m, n), dtype=np.int16, order='C')

   for mat in (mat_A, mat_B, C_fpga, bias):
       gemx.sendMat(mat)
   scale, shift = post_scale[0], post_scale[1]
   gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, scale, shift)
   gemx.execute()
   gemx.clearInstrBuf()
   gemx.getMat(C_fpga)
   self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 5
0
def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """Queue two chained GEMM instructions in one execution and verify both.

    The first instruction uses A, B and b0 and writes C; the second uses
    D, C and b1 and writes E. Both are queued with post-scale arguments
    (1, 0) and each result is checked with test.multiply_and_cmp.

    Args:
        int_range: bound for the random values (drawn from
            [-int_range, int_range)).
        m, k, n: matrix dimensions; A and D are (m, k), B is (k, n) and
            C, E and both biases are (m, n).
        add_bias: when equal to True the biases are random int32 values,
            otherwise they are all zero.
    """
    print("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))

    def random_mat(shape, dtype):
        return np.random.randint(low=-int_range,
                                 high=int_range,
                                 size=shape,
                                 dtype=dtype)

    out_shape = (m, n)
    # Draw order (A, B, D, then the biases) matches the RNG sequence callers
    # would observe with a fixed seed.
    A = random_mat((m, k), np.int16)
    B = random_mat((k, n), np.int16)
    D = random_mat((m, k), np.int16)
    C = np.zeros(out_shape, dtype=np.int16)
    E = np.zeros(out_shape, dtype=np.int16)

    # Explicit comparison kept on purpose: only a value equal to True
    # enables the random bias.
    if add_bias == True:  # noqa: E712
        b0 = random_mat(out_shape, np.int32)
        b1 = random_mat(out_shape, np.int32)
    else:
        b0 = np.zeros(out_shape, dtype=np.int32)
        b1 = np.zeros(out_shape, dtype=np.int32)

    for mat in (A, B, b0, C, D, E, b1):
        gemx.sendMat(mat)
    gemx.addGEMMOp(A, B, C, b0, 1, 0)
    gemx.addGEMMOp(D, C, E, b1, 1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    for result in (C, E):
        gemx.getMat(result)

    unit_scale = [1, 0]
    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, unit_scale)
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, [1, 0])
Exemplo n.º 6
0
 def test_basic(self,PE, mat_A, mat_B, bias, post_scale = [1,1]):
     """Run a single GEMM instruction on processing element PE and verify it.

     Prints the problem size and the max/min/average of each input, uploads
     the operands plus an int16 result buffer, queues one gemx.addGEMMOp,
     executes it, reads the result back and passes it to
     self.multiply_and_cmp.

     NOTE(review): the instruction buffer is not cleared after execution
     here - confirm that this is intended.

     Args:
         PE: index passed to every gemx call.
         mat_A, mat_B, bias: operand arrays (indexed as 2-D via .shape).
         post_scale: two-element sequence forwarded to gemx.addGEMMOp.
     """
     m, k = mat_A.shape[0], mat_A.shape[1]
     n = mat_B.shape[1]
     print ("test_basic(PE=%d): %d %d %d %d %d" % (PE,m, k, n, post_scale[0], post_scale[1] ))
     for label, mat in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print (label, np.amax(mat), np.amin(mat), np.average(mat))

     C_fpga = np.zeros((m, n), dtype=np.int16)
     for mat in (mat_A, mat_B, C_fpga, bias):
         gemx.sendMat(mat, PE)
     # default test_basic will call addGEMMOp
     gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias,
                    post_scale[0], post_scale[1], PE)
     gemx.execute(PE)
     gemx.getMat(C_fpga, PE)
     self.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 7
0
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range,
                         post_scale):
    """Time a chain of ``ins_count`` GEMM instructions and verify the last one.

    Instruction 0 uses a random B matrix; every later instruction i uses the
    output of instruction i - 1 as its second operand. Earlier revisions
    hard-coded exactly four instructions while still sizing the buffers and
    the operation count from ``ins_count``; the chain, the read-back and the
    golden comparison now all follow ``ins_count`` (behaviour for
    ``ins_count == 4`` is unchanged).

    Args:
        ins_count: number of chained instructions; must be at least 1.
        m_size, k_size, n_size: per-instruction dimensions, indexed
            0 .. ins_count - 1. The caller is responsible for choosing sizes
            for which each output is a valid second operand of the next
            instruction.
        A_range: bound for the random int16 A matrices ([-A_range, A_range)).
        B_range: bound for the random int16 B matrix of instruction 0.
        post_scale: two-element sequence; items 0 and 1 are forwarded to
            every gemx.addGEMMOp call.

    Raises:
        ValueError: if ``ins_count`` is smaller than 1.
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1, got %r" % (ins_count,))

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        multiply_add_ops = 2 * m_size[i] * n_size[i] * k_size[i]
        total_operations += multiply_add_ops + m_size[i] * n_size[i] * 3
        total_parallel_operations += multiply_add_ops
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)

    def second_operand(i):
        # Instruction 0 consumes the random matrix; later ones consume the
        # previous instruction's output.
        return mat_B0 if i == 0 else mat_C[i - 1]

    timePointKernel = []
    timePointKernel.append(time.time())  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)
    for i in range(ins_count):
        gemx.addGEMMOp(mat_A[i], second_operand(i), mat_C[i], mat_bias[i],
                       post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    for result in mat_C:
        gemx.getMat(result)
    timePointKernel.append(time.time())  # copy from FPGA
    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)
    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(
            n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Only the final link of the chain is checked against the golden
        # result, using the device output of the previous link as input.
        last = ins_count - 1
        test.multiply_and_cmp(mat_C[last], mat_A[last], second_operand(last),
                              mat_bias[last], m_size[last], n_size[last],
                              post_scale)
Exemplo n.º 8
0
 def predict ( self, inp, in_scale, post_scale):
     """Run inp through the queued layer ops and return the cropped output.

     The input is copied into a zero-filled array whose shape comes from
     self.get_padded_shape, multiplied by in_scale, converted with np.int16
     and copied into self.fpga_buf[0]. One gemx op per layer of self.kmodel
     is then queued (gemx.addFCNOp for 'relu' activations, gemx.addGEMMOp
     otherwise), executed, and the last buffer is read back.

     NOTE(review): no gemx.clearInstrBuf call is made in this method -
     confirm how repeated calls are expected to behave.

     Args:
         inp: 2-D input array.
         in_scale: factor applied to the padded input before the int16
             conversion.
         post_scale: per-layer sequence; post_scale[i][0] and
             post_scale[i][1] are forwarded to the op queued for layer i.

     Returns:
         self.fpga_buf[-1] sliced to self.out_dim[0] rows and
         self.out_dim[1] columns.
     """
     padded_rows, padded_cols = self.get_padded_shape( inp.shape, self.min_m, self.min_k)
     padded_arr = np.zeros((padded_rows, padded_cols), dtype=inp.dtype, order='C')
     padded_arr[0:inp.shape[0], 0:inp.shape[1]] = inp

     print ("input shape", padded_arr.shape)
     quantized = np.int16( padded_arr * in_scale )
     np.copyto(self.fpga_buf[0], quantized, casting='same_kind', where=True)
     gemx.sendMat(self.fpga_buf[0])

     for idx, layer in enumerate(self.kmodel.layers):
         uses_relu = layer.get_config()['activation'] == 'relu'
         # Operands shared by both instruction kinds.
         op_args = (self.fpga_buf[idx], self.w[idx], self.fpga_buf[idx+1],
                    self.b[idx], post_scale[idx][0], post_scale[idx][1])
         if uses_relu:
             gemx.addFCNOp(*(op_args + (0, 0)))
         else:
             gemx.addGEMMOp(*op_args)

     gemx.execute()
     final_buf = self.fpga_buf[-1]
     gemx.getMat (final_buf)
     return self.fpga_buf[-1][:self.out_dim[0],:self.out_dim[1]]
Exemplo n.º 9
0
def test_perf_gemm(m,
                   k,
                   n,
                   A_range=32764,
                   B_range=32764,
                   bias_range=32764,
                   post_scale=[1, 0]):
    """Run one GEMM instruction, record four time points and verify the result.

    Time points are taken before the upload, after the instruction is
    queued, after execution (and clearing the instruction buffer) and after
    the result is read back; they are passed to test.test_perf along with
    the operation counts and gemx.getFreq().

    Args:
        m, k, n: matrix dimensions; A is (m, k), B is (k, n) and the bias
            and result are (m, n).
        A_range, B_range: bounds for the random int16 contents of A and B
            (drawn from [-range, range)).
        bias_range: bound for the random int32 bias; 0 selects an all-zero
            bias instead.
        post_scale: two-element sequence; items 0 and 1 are forwarded to
            gemx.addGEMMOp.
    """
    out_shape = (m, n)
    mat_A = np.random.randint(low=-A_range, high=A_range,
                              size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range,
                              size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros(out_shape, dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range,
                                 size=out_shape, dtype=np.int32)
    C_fpga = np.zeros(out_shape, dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for mat in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(mat)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    timePointKernel.append(time.time())  # copy from FPGA

    total_parallel_operations = 2 * m * n * k
    total_operations = 2 * m * n * k + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, m, k, n)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 10
0
def test_perf_gemm_gemm(A_range, B_range, bias_range, m, k, n, post_scale):
    """Run one GEMM instruction, record four time points and optionally verify.

    Time points are taken before the upload, after the instruction is
    queued, after execution and after the result is read back; they are
    passed to test.test_perf along with the operation counts and
    gemx.getFreq(). The golden comparison is skipped only when m, n and k
    are all greater than 4096.

    Args:
        A_range, B_range: bounds for the random int16 contents of A and B
            (drawn from [-range, range)).
        bias_range: bound for the random int32 bias; 0 selects an all-zero
            bias instead.
        m, k, n: matrix dimensions; A is (m, k), B is (k, n) and the bias
            and result are (m, n).
        post_scale: two-element sequence; items 0 and 1 are forwarded to
            gemx.addGEMMOp.
    """
    out_shape = (m, n)
    mat_A = np.random.randint(low=-A_range, high=A_range,
                              size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range,
                              size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros(out_shape, dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range,
                                 size=out_shape, dtype=np.int32)
    C_fpga = np.zeros(out_shape, dtype=np.int16)

    timePointKernel = [time.time()]  # current time
    for mat in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(mat)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    timePointKernel.append(time.time())  # copy from FPGA

    total_parallel_operations = 2 * m * n * k
    total_operations = 2 * m * n * k + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, m, k, n)

    # The host-side reference check is skipped only when every dimension
    # exceeds 4096.
    if m > 4096 and n > 4096 and k > 4096:
        print("Skip golden comparision because large matrix size")
        return
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
Exemplo n.º 11
0
 def loadInstr(self):
     """Clear the gemx instruction buffer and queue one GEMM op per layer.

     Pairs of (self._qw[i], self._qb[i]) are walked in order. Op i takes the
     weights as its first operand and self.fpga_buf[i] as its second, writes
     self.fpga_buf[i + 1], and is scaled by self.post_scale[i].
     """
     gemx.clearInstrBuf()
     for idx, (weights, layer_bias) in enumerate(zip(self._qw, self._qb)):
         src_buf = self.fpga_buf[idx]
         dst_buf = self.fpga_buf[idx + 1]
         scale = self.post_scale[idx][0]
         shift = self.post_scale[idx][1]
         gemx.addGEMMOp(weights, src_buf, dst_buf, layer_bias, scale, shift)