예제 #1
0
    def __init__(self, xclbin_opt, wgt, bias, wgt_scale, post_scale):
        """Quantize the weights and biases and send the padded weights.

        Parameters
        ----------
        xclbin_opt: dict
                    options; "GEMX_gemmMBlocks", "GEMX_gemmKBlocks" and
                    "GEMX_gemmNBlocks" are read
        wgt:        array or list of arrays
                    weights; a single array is wrapped into a list
        bias:       array or list of arrays
                    biases; a single array is wrapped into a list
        wgt_scale:  sequence
                    per-entry factor applied to both weights and biases
                    before the integer cast
        post_scale: stored unchanged as ``self.post_scale``
        """
        #Ensuring min_m and min_n never fall below min_k is needed when chaining multiple GEMM operations
        #If min_m/min_n is less than min_k, using the output of a GEMM call where either dimension
        #is less than min_k would lead to bad results if it's directly used as input for another GEMM operation
        k_blocks = int(xclbin_opt["GEMX_gemmKBlocks"])
        self.min_m = 32 * max(k_blocks, int(xclbin_opt["GEMX_gemmMBlocks"]))
        self.min_k = 32 * k_blocks
        self.min_n = 32 * max(k_blocks, int(xclbin_opt["GEMX_gemmNBlocks"]))

        # isinstance also accepts list subclasses, which the exact type
        # comparison used to wrap into a one-element list by mistake.
        if not isinstance(wgt, list):
            wgt = [wgt]

        if not isinstance(bias, list):
            bias = [bias]

        self._wshape = [w.shape for w in wgt]

        self._qw = [np.int16(a * b) for a, b in zip(wgt, wgt_scale)]
        self._qb = [np.int32(a * b) for a, b in zip(bias, wgt_scale)]
        for i, w in enumerate(self._qw):
            self._qw[i] = self.format_for_fpga(w, self.min_k, self.min_n)
            gemx.sendMat(self._qw[i])

        self.fpga_buf = []
        self.out_dim = None
        self.post_scale = post_scale
        self.batch_sz = 0
예제 #2
0
    def format_bias(self, b, dim, min_row, min_col):
        """Expand a bias to ``dim``, pad it and send it.

        A one-dimensional ``b`` is broadcast to ``dim`` first; any other rank
        is used as given. The output of ``format_for_fpga`` is handed to
        ``gemx.sendMat`` and returned.
        """
        expanded = np.broadcast_to(b, dim) if b.ndim == 1 else b
        fpga_bias = self.format_for_fpga(expanded, min_row, min_col)
        gemx.sendMat(fpga_bias)
        return fpga_bias
예제 #3
0
def common_uspmv(rows,cols,datas,m_sizes,k_sizes,nnz_sizes, num_runs,vector_range):
    """Run a chain of USPMV operations and compare against the reference.

    ``m_sizes`` and ``k_sizes`` are padded IN PLACE to multiples of
    ddrWidth * GEMX_uspmvInterleaves. One sparse matrix is sent per entry of
    ``m_sizes``; the output of each operation is the input of the next.
    ``vector_range`` is accepted but not used here.
    """
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    min_m = ddrWidth * int(xclbin_opts["GEMX_uspmvInterleaves"])
    for i in range(len(m_sizes)):
        m_sizes[i] = test.get_padded_size(m_sizes[i], min_m)
        k_sizes[i] = test.get_padded_size(k_sizes[i], min_m)
    print("size:", m_sizes, k_sizes, "nnz:", nnz_sizes)
    # B is the input of the FIRST operation (C_list[0]), so it must match the
    # column count of layer 0, not whatever index the padding loop ended on.
    B = np.zeros((num_runs, k_sizes[0]), dtype=np.float32)
    test.fillMod(9, num_runs, k_sizes[0], B)
    B = B.astype(np.float32)
    C_list = [B]
    for i in range(len(m_sizes)):
        C = np.zeros((num_runs, m_sizes[i]), dtype=np.float32)
        C_list.append(C)
        A = gemx.sendUSpMat(np.array(rows[i]).astype(np.uint16),
                            np.array(cols[i]).astype(np.uint16),
                            np.array(datas[i]),
                            np.array(m_sizes[i], dtype=np.int32),
                            np.array(k_sizes[i], dtype=np.int32),
                            np.array(nnz_sizes[i], dtype=np.int32),
                            np.array(1, dtype=np.float32),
                            xclbin_opts)
        gemx.sendMat(C_list[i])
        gemx.sendMat(C_list[i + 1])
        gemx.addUSPMVOp(A, C_list[i], C_list[i + 1], num_runs)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_list[-1])
    test.multiply_and_cmp_uspmv(rows, cols, datas, m_sizes, k_sizes, B, C_list[-1])
예제 #4
0
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """Run one SPMV operation with padded sizes and compare the result.

    The element type comes from xclbin_opts["GEMX_dataType"] ("float" or
    "int32_t"); any other value raises TypeError. ``m`` is padded to a
    multiple of GEMX_spmvWidth * GEMX_spmvMacGroups and ``k`` to a multiple
    of GEMX_ddrWidth.
    """
    type_name = xclbin_opts["GEMX_dataType"]
    supported = {"float": np.float32, "int32_t": np.int32}
    if type_name not in supported:
        raise TypeError("type", type_name, "not supported")
    data_type = supported[type_name]

    min_k = int(xclbin_opts["GEMX_ddrWidth"])
    min_m = int(xclbin_opts["GEMX_spmvWidth"]) * int(
        xclbin_opts["GEMX_spmvMacGroups"])
    m = get_padded_size(m, min_m)
    k = get_padded_size(k, min_k)
    print("size:", m, k, "nnz:", nnz)

    if data_type is np.float32:
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
    else:
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
    C = np.zeros((m, 1), dtype=data_type)

    A = gemx.sendSpMat(row, col, data, m, k, nnz, xclbin_opts)
    for vec in (B, C):
        gemx.sendMat(vec)
    gemx.addSPMVOp(A, B, C, nnz, xclbin_opts)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
예제 #5
0
    def format_bias(self, b, dim, min_row, min_col):
        """Expand a bias, transpose it, pad it and send it.

        A one-dimensional ``b`` is broadcast to ``(dim[1], dim[0])`` first.
        The bias is then transposed, passed through ``format_for_fpga``,
        handed to ``gemx.sendMat`` and returned.
        """
        if b.ndim == 1:
            swapped_shape = (dim[1], dim[0])
            b = np.broadcast_to(b, swapped_shape)

        fpga_bias = self.format_for_fpga(np.transpose(b), min_row, min_col)
        gemx.sendMat(fpga_bias)
        return fpga_bias
예제 #6
0
 def predict ( self, inp):
     """Run a single USPMV operation on ``inp`` and return the cropped result.

     ``self.out_dim`` is updated so that its first entry is the row count of
     ``inp``; the returned array is cropped to ``self.out_dim``.
     """
     self.out_dim = (inp.shape[0], self.out_dim[1])
     padded = self.format_for_fpga(inp, 1, 1)
     batch = padded.shape[0]

     fpga_in = padded.astype(np.float32)
     gemx.sendMat(fpga_in)
     fpga_out = np.zeros((batch, self.sizes[0][0]), dtype=np.float32)
     gemx.sendMat(fpga_out)

     gemx.addUSPMVOp(self.A_list[0], fpga_in, fpga_out, batch)
     gemx.execute()
     gemx.getMat(fpga_out)
     gemx.clearInstrBuf()

     rows, cols = self.out_dim
     return fpga_out[:rows, :cols]
예제 #7
0
 def test_basic(self,
                PE,
                mat_A,
                mat_B,
                bias,
                post_scale=[1, 1],
                RELU_scale=[1, 0]):
     """Run one FCN operation on ``PE`` and compare it with the reference.

     Prints the sizes and the max/min/average of each operand, sends the
     operands plus a zeroed int16 output, executes, reads the output back and
     passes it to ``self.multiply_and_cmp``.
     """
     rows, inner, cols = mat_A.shape[0], mat_A.shape[1], mat_B.shape[1]
     print("test Fcn")
     print("test_basic: %d %d %d %d %d" %
           (rows, inner, cols, post_scale[0], post_scale[1]))
     for label, mat in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print(label, np.amax(mat), np.amin(mat), np.average(mat))

     result = np.zeros((rows, cols), dtype=np.int16, order='C')
     for operand in (mat_A, mat_B, result, bias):
         gemx.sendMat(operand, PE)
     gemx.addFCNOp(mat_A, mat_B, result, bias, post_scale[0], post_scale[1],
                   RELU_scale[0], RELU_scale[1], PE)
     gemx.execute(PE)
     gemx.clearInstrBuf(PE)
     gemx.getMat(result, PE)
     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale,
                           RELU_scale)
예제 #8
0
def test_perf_fcn(A_range, B_range, bias_range, m, k, n, post_scale):
    """Time one FCN operation on random int16 operands.

    Four timestamps are taken (start, after send/add, after execute, after
    read-back) and passed to ``test.test_perf``. The comparison against the
    reference is skipped only when m, k and n all exceed 4096.
    """
    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addFCNOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1], 1, 0)
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    stamps.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, freq, m, k, n)

    if m <= 4096 or n <= 4096 or k <= 4096:
        test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
    else:
        print("Skip golden comparision because large matrix size")
예제 #9
0
def test_perf_gemm(m, k, n, xclbin_opts, post_scale=[1,0], A_range=32764, B_range=32764, bias_range=32764):
    """Time one GEMM operation on random int16 operands of padded size.

    m, k and n are padded to multiples of GEMX_ddrWidth times the matching
    GEMX_gemm{M,K,N}Blocks option. The time spanning send, execute and
    read-back is passed to ``test.test_perf``; the result is then compared
    with the reference.
    """
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    m = test.get_padded_size(m, int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
    k = test.get_padded_size(k, int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
    n = test.get_padded_size(n, int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)

    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    started = time.time()
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C_fpga)
    elapsed = time.time() - started

    all_ops = 2 * m * n * k + m * n * 3
    test.test_perf(elapsed, all_ops, m, k, n, ddrWidth)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
예제 #10
0
파일: test.py 프로젝트: liujieuw/gemx
 def test_basic(self,
                PE,
                xclbin_opts,
                mat_A,
                mat_B,
                bias,
                post_scale=[1, 0]):
     """Run one GEMM operation on ``PE`` and compare it with the reference.

     The output buffer is int16 when xclbin_opts["GEMX_dataType"] is "short"
     and float32 otherwise.
     """
     rows, inner, cols = mat_A.shape[0], mat_A.shape[1], mat_B.shape[1]
     print("test_basic(PE=%d): %d %d %d %d %d" %
           (PE, rows, inner, cols, post_scale[0], post_scale[1]))
     for label, mat in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print(label, np.amax(mat), np.amin(mat), np.average(mat))

     out_dtype = np.int16 if xclbin_opts["GEMX_dataType"] == "short" else np.float32
     result = np.zeros((rows, cols), dtype=out_dtype, order='C')

     for operand in (mat_A, mat_B, result, bias):
         gemx.sendMat(operand, PE)
     # default test_basic will call addGEMMOp
     gemx.addGEMMOp(mat_A, mat_B, result, bias, post_scale[0], post_scale[1],
                    PE)
     gemx.execute(PE)
     gemx.clearInstrBuf(PE)
     gemx.getMat(result, PE)
     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale)
예제 #11
0
def common_spmv(row, col, data, m, k, nnz, vector_range):
    """Run one SPMV operation and compare the result with the reference.

    The element type comes from xclbin_opts["GEMX_dataType"]: "int32_t" uses
    a random int32 input vector, "float" uses a float32 vector filled by
    ``test.fillMod``; any other value raises TypeError.
    """
    ddr_width = int(xclbin_opts["GEMX_ddrWidth"])
    type_name = xclbin_opts["GEMX_dataType"]
    if type_name == "int32_t":
        dtype = np.int32
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
    elif type_name == "float":
        dtype = np.float32
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
    else:
        raise TypeError("type", xclbin_opts["GEMX_dataType"], "not supported")

    # Everything below is identical for both element types.
    C = np.zeros((m, 1), dtype=dtype)
    A = gemx.sendSpMat(row, col, data, ddr_width, dtype)
    gemx.sendMat(B)
    gemx.sendMat(C)
    gemx.addSPMVOp(A, B, C, nnz)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
예제 #12
0
def common_spmv(row, col, data, m, k, nnz, vector_range, dtype):
    """Run one SPMV operation for ``dtype`` and compare with the reference.

    ``dtype`` must compare equal to np.int32 (random input vector) or
    np.float32 (vector filled by ``test.fillMod``); anything else raises
    TypeError.
    """
    if dtype == np.int32:
        out_dtype = np.int32
        B = np.random.randint(low=-vector_range,
                              high=vector_range,
                              size=(k, 1),
                              dtype=np.int32)
    elif dtype == np.float32:
        out_dtype = np.float32
        B = np.zeros((k, 1), dtype=np.float32)
        test.fillMod(B, k, vector_range)
    else:
        raise TypeError("type", dtype, "not supported")

    # Shared tail: both element types go through the same sequence of calls.
    C = np.zeros((m, 1), dtype=out_dtype)
    A = gemx.sendSpMat(row, col, data, nnz, dtype)
    gemx.sendMat(B)
    gemx.sendMat(C)
    gemx.addSPMVOp(A, B, C, nnz)
    gemx.execute()
    gemx.getMat(C)
    test.multiply_and_cmp_spmv(row, col, data, m, k, nnz, B, C)
예제 #13
0
    def predict(self, inp, in_scale):
        """Scale and pad ``inp``, execute, and return the cropped last buffer.

        The scaled input is cast to int16 and copied into ``self.fpga_buf[0]``
        before it is sent; the result is read from ``self.fpga_buf[-1]`` and
        cropped to ``self.out_dim``.
        """
        self.init_fpgabuf(inp.shape)
        self.loadInstr()

        scaled = self.format_for_fpga(inp * in_scale, self.min_m, self.min_k)

        in_buf = self.fpga_buf[0]
        np.copyto(in_buf, np.int16(scaled), casting='same_kind', where=True)
        gemx.sendMat(in_buf)
        gemx.execute()

        out_buf = self.fpga_buf[-1]
        gemx.getMat(out_buf)
        rows, cols = self.out_dim[0], self.out_dim[1]
        return out_buf[:rows, :cols]
예제 #14
0
def test_perf_multi_gemm(ins_count, m_size, k_size, n_size, A_range, B_range,
                         post_scale):
    """Time a chain of ``ins_count`` GEMM operations.

    Operation 0 multiplies mat_A[0] by a random B matrix; every later
    operation i multiplies mat_A[i] by the output of operation i - 1. The
    chain length now follows ``ins_count`` instead of being fixed at four, so
    it matches the matrices allocated and the operation totals reported.

    Raises
    ------
    ValueError
        if ``ins_count`` is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        parallel_ops = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += parallel_ops
        total_operations += parallel_ops + m_size[i] * n_size[i] * 3
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)
    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Chain the operations: each one consumes the previous output as B.
    rhs = mat_B0
    for i in range(ins_count):
        gemx.addGEMMOp(mat_A[i], rhs, mat_C[i], mat_bias[i], post_scale[0],
                       post_scale[1])
        rhs = mat_C[i]
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    timePointKernel.append(time.time())  # call kernel
    for out in mat_C:
        gemx.getMat(out)
    timePointKernel.append(time.time())  # copy from FPGA
    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations,
                   total_parallel_operations, freq, 0, 0, 0)
    if np.max(m_size) > 4096 and np.max(k_size) > 4096 and np.max(
            n_size) > 4096:
        print("Skip golden comparision because large matrix size")
    else:
        # Verify the last link of the chain against its actual inputs.
        last = ins_count - 1
        last_rhs = mat_C[last - 1] if last > 0 else mat_B0
        test.multiply_and_cmp(mat_C[last], mat_A[last], last_rhs,
                              mat_bias[last], m_size[last], n_size[last],
                              post_scale)
예제 #15
0
 def predict ( self, inp, in_scale, post_scale):
     """Pad and scale ``inp``, queue one operation per layer, and execute.

     Layers whose config reports the 'relu' activation are queued with
     ``gemx.addFCNOp``; all others with ``gemx.addGEMMOp``. The last buffer
     is read back and returned cropped to ``self.out_dim``.
     """
     n_rows, n_cols = self.get_padded_shape(inp.shape, self.min_m, self.min_k)
     padded = np.zeros((n_rows, n_cols), dtype=inp.dtype, order='C')
     padded[0:inp.shape[0], 0:inp.shape[1]] = inp

     print ("input shape", padded.shape)
     np.copyto(self.fpga_buf[0], np.int16(padded * in_scale), casting='same_kind', where=True)
     gemx.sendMat(self.fpga_buf[0])

     for idx, layer in enumerate(self.kmodel.layers):
         uses_relu = layer.get_config()['activation'] == 'relu'
         src, dst = self.fpga_buf[idx], self.fpga_buf[idx + 1]
         if uses_relu:
             gemx.addFCNOp(src, self.w[idx], dst, self.b[idx], post_scale[idx][0], post_scale[idx][1], 0, 0)
         else:
             gemx.addGEMMOp(src, self.w[idx], dst, self.b[idx], post_scale[idx][0], post_scale[idx][1])

     gemx.execute()
     gemx.getMat(self.fpga_buf[-1])
     return self.fpga_buf[-1][:self.out_dim[0], :self.out_dim[1]]
예제 #16
0
def test_multi_fcn(ins_count,
                   m_size,
                   k_size,
                   n_size,
                   post_scale=[1, 0],
                   A_range=32764,
                   B_range=32764):
    """Run a chain of ``ins_count`` FCN operations and verify the last one.

    The first ``ins_count`` entries of ``m_size``, ``k_size`` and ``n_size``
    are padded IN PLACE to multiples of GEMX_ddrWidth times the matching
    GEMX_gemm{M,K,N}Blocks option. Operation 0 multiplies mat_A[0] by a
    random B matrix; every later operation consumes the previous output. The
    chain length now follows ``ins_count`` instead of being fixed at four.

    Raises
    ------
    ValueError
        if ``ins_count`` is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    mat_A = []
    mat_C = []
    mat_bias = []
    ddrWidth = int(xclbin_opts["GEMX_ddrWidth"])
    for i in range(ins_count):
        m_size[i] = test.get_padded_size(
            m_size[i],
            int(xclbin_opts["GEMX_gemmMBlocks"]) * ddrWidth)
        k_size[i] = test.get_padded_size(
            k_size[i],
            int(xclbin_opts["GEMX_gemmKBlocks"]) * ddrWidth)
        n_size[i] = test.get_padded_size(
            n_size[i],
            int(xclbin_opts["GEMX_gemmNBlocks"]) * ddrWidth)
        mat_A.append(
            np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m_size[i], k_size[i]),
                              dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(
            np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range,
                               high=B_range,
                               size=(k_size[0], n_size[0]),
                               dtype=np.int16)
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Chain the operations: each one consumes the previous output as B.
    rhs = mat_B0
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], rhs, mat_C[i], mat_bias[i], post_scale[0],
                      post_scale[1], 1, 0)
        rhs = mat_C[i]
    gemx.execute()
    gemx.clearInstrBuf()
    for out in mat_C:
        gemx.getMat(out)

    # Verify the last link of the chain against its actual inputs.
    last = ins_count - 1
    last_rhs = mat_C[last - 1] if last > 0 else mat_B0
    test.multiply_and_cmp(mat_C[last], mat_A[last], last_rhs, mat_bias[last],
                          m_size[last], n_size[last], post_scale)
예제 #17
0
    def __init__(self, xclbin_opts, wgt, bias, wgt_scale, bias_scale,
                 post_scale, relu_scale):
        """Quantize (unless float) the weights and biases and send the weights.

        Parameters
        ----------
        xclbin_opts: dict
                     options; "GEMX_ddrWidth", "GEMX_gemmMBlocks",
                     "GEMX_gemmKBlocks", "GEMX_gemmNBlocks" and
                     "GEMX_dataType" are read
        wgt:         array or list of arrays
                     weights; a single array is wrapped into a list
        bias:        array or list of arrays
                     biases; a single array is wrapped into a list
        wgt_scale:   sequence, same length as ``wgt``
        bias_scale:  sequence, same length as ``bias``
        post_scale:  stored unchanged as ``self.post_scale``
        relu_scale:  stored unchanged as ``self.relu_scale``
        """
        #min_m is kept at least as large as min_k, which is needed when chaining multiple GEMM operations:
        #using the output of a GEMM call with a dimension below min_k directly as input
        #for another GEMM operation would lead to bad results. min_n is not clamped here.
        ddrwidth = int(xclbin_opts["GEMX_ddrWidth"])
        k_blocks = int(xclbin_opts["GEMX_gemmKBlocks"])
        self.min_m = ddrwidth * max(k_blocks,
                                    int(xclbin_opts["GEMX_gemmMBlocks"]))
        self.min_k = ddrwidth * k_blocks
        self.min_n = ddrwidth * int(xclbin_opts["GEMX_gemmNBlocks"])

        # isinstance also accepts list subclasses, which the exact type
        # comparison used to wrap into a one-element list by mistake.
        if not isinstance(wgt, list):
            wgt = [wgt]

        if not isinstance(bias, list):
            bias = [bias]

        assert len(wgt) == len(wgt_scale)
        assert len(bias) == len(bias_scale)

        self._wshape = [w.shape for w in wgt]
        if xclbin_opts["GEMX_dataType"] == "float":
            # Shallow copies: the loop below replaces entries of self._qw, and
            # must not overwrite the caller's own weight list in place.
            self._qw = list(wgt)
            self._qb = list(bias)
        else:
            self._qw = [
                np.int16(np.around(a * b)) for a, b in zip(wgt, wgt_scale)
            ]
            self._qb = [
                np.int32(np.around(a * b)) for a, b in zip(bias, bias_scale)
            ]
        for i, w in enumerate(self._qw):
            self._qw[i] = self.format_for_fpga(np.transpose(w), self.min_m,
                                               self.min_k)
            gemx.sendMat(self._qw[i])

        self.fpga_buf = []
        self.out_dim = None
        self.post_scale = post_scale
        self.relu_scale = relu_scale
        self.batch_sz = 0
예제 #18
0
    def predict(self, inp, in_scale):
        """Transpose, scale and pad ``inp``, execute, and return the result.

        Parameters
        ----------
        inp:      array
                  input matrix; it is transposed before use
        in_scale: float
                  factor multiplied into the transposed input

        Return
        ------
        array
                  ``self.fpga_buf[-1]`` cropped to ``self.out_dim`` and
                  transposed back
        """
        inp = np.transpose(inp)
        self.init_fpgabuf(inp.shape)
        self.loadInstr()

        scaled = self.format_for_fpga(inp * in_scale, self.min_k, self.min_n)

        # An int16 input buffer takes an explicit int16 cast of the data;
        # any other buffer type takes the padded array as it is.
        in_buf = self.fpga_buf[0]
        source = np.int16(scaled) if in_buf.dtype == np.int16 else scaled
        np.copyto(in_buf, source, casting='same_kind', where=True)

        gemx.sendMat(in_buf)
        gemx.execute()
        out_buf = self.fpga_buf[-1]
        gemx.getMat(out_buf)
        cropped = out_buf[:self.out_dim[0], :self.out_dim[1]]
        return np.transpose(cropped)
예제 #19
0
파일: test.py 프로젝트: saadmahboob/gemx
 def test_textfiles(self, path_to_a, path_to_b, path_to_bias,post_scale):
     """Load A, B and bias from text files, run one FCN op and compare.

     A and B are loaded as int16 and the bias as int32 with ``np.loadtxt``.
     """
     lhs = np.loadtxt(path_to_a, dtype=np.int16)
     rhs = np.loadtxt(path_to_b, dtype=np.int16)
     offsets = np.loadtxt(path_to_bias, dtype=np.int32)
     rows, cols = lhs.shape[0], rhs.shape[1]
     product = np.zeros((rows, cols), dtype=np.int16, order='C')

     for operand in (lhs, rhs, product, offsets):
         gemx.sendMat(operand)
     gemx.addFCNOp(lhs, rhs, product, offsets, post_scale[0], post_scale[1], 1, 0)
     gemx.execute()
     gemx.clearInstrBuf()
     gemx.getMat(product)
     self.multiply_and_cmp(product, lhs, rhs, offsets, rows, cols, post_scale)
예제 #20
0
 def predict(self, inp, in_scale):
     """Run every layer as one SPMV operation per input column.

     The padded, scaled input is transposed so each column can be fed to the
     layer's sparse matrix on its own. The per-column outputs are joined
     along axis 1, the padded bias is added, and the result becomes the input
     of the next layer. The final output is transposed back and cropped to
     ``self.out_dim``.
     """
     C_list = [[] for _ in range(len(self.kmodel.layers) + 1)]
     inp = self.format_for_fpga(inp, self.min_m, self.min_m)
     C_list[0] = np.transpose(inp * in_scale)
     for layer, layer_bias in enumerate(self._qb):
         layer_in = C_list[layer]
         layer_bias = layer_bias.reshape(layer_bias.shape[0], 1)
         layer_bias = self.format_for_fpga(layer_bias, layer_in.shape[1],
                                           self.sizes[layer][0])
         for col in range(layer_in.shape[1]):
             vec_in = (layer_in[:, col]).astype(np.float32)
             vec_out = np.zeros((self.sizes[layer][0], 1), dtype=np.float32)
             gemx.sendMat(vec_in)
             gemx.sendMat(vec_out)
             gemx.addSPMVOp(self.A_list[layer], vec_in, vec_out,
                            self.sizes[layer][2])
             gemx.execute()
             gemx.clearInstrBuf()
             gemx.getMat(vec_out)
             if col == 0:
                 C_list[layer + 1] = vec_out
             else:
                 C_list[layer + 1] = np.append(C_list[layer + 1], vec_out,
                                               axis=1)
         C_list[layer + 1] = C_list[layer + 1] + np.transpose(layer_bias)
     result = np.transpose(C_list[-1])
     return result[:self.out_dim[0], :self.out_dim[1]]
예제 #21
0
 def test_basic(self,PE, mat_A, mat_B, bias, post_scale = [1,1]):
     """Run one GEMM operation on ``PE`` and compare it with the reference.

     Prints the sizes and the max/min/average of each operand, sends the
     operands plus a zeroed int16 output, executes, reads the output back and
     passes it to ``self.multiply_and_cmp``.
     """
     rows, inner, cols = mat_A.shape[0], mat_A.shape[1], mat_B.shape[1]
     print ("test_basic(PE=%d): %d %d %d %d %d" % (PE, rows, inner, cols, post_scale[0], post_scale[1]))
     for label, mat in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
         print (label, np.amax(mat), np.amin(mat), np.average(mat))

     result = np.zeros((rows, cols), dtype=np.int16)
     for operand in (mat_A, mat_B, result, bias):
         gemx.sendMat(operand, PE)
     # default test_basic will call addGEMMOp
     gemx.addGEMMOp(mat_A, mat_B, result, bias, post_scale[0], post_scale[1], PE)
     gemx.execute(PE)
     gemx.getMat(result, PE)
     self.multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale)
예제 #22
0
def test_perf_gemm(m,
                   k,
                   n,
                   A_range=32764,
                   B_range=32764,
                   bias_range=32764,
                   post_scale=[1, 0]):
    """Time one GEMM operation on random int16 operands and verify it.

    Four timestamps are taken (start, after send/add, after execute and
    instruction-buffer clear, after read-back) and passed to
    ``test.test_perf``.
    """
    mat_A = np.random.randint(low=-A_range,
                              high=A_range,
                              size=(m, k),
                              dtype=np.int16)
    mat_B = np.random.randint(low=-B_range,
                              high=B_range,
                              size=(k, n),
                              dtype=np.int16)
    if bias_range == 0:
        bias = np.zeros((m, n), dtype=np.int32, order='C')
    else:
        bias = np.random.randint(low=-bias_range,
                                 high=bias_range,
                                 size=(m, n),
                                 dtype=np.int32)
    C_fpga = np.zeros((m, n), dtype=np.int16)

    stamps = [time.time()]  # current time
    for operand in (mat_A, mat_B, C_fpga, bias):
        gemx.sendMat(operand)
    gemx.addGEMMOp(mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    stamps.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    stamps.append(time.time())  # call kernel
    gemx.getMat(C_fpga)
    stamps.append(time.time())  # copy from FPGA

    parallel_ops = 2 * m * n * k
    all_ops = parallel_ops + m * n * 3
    freq = gemx.getFreq()
    test.test_perf(stamps, all_ops, parallel_ops, freq, m, k, n)
    test.multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
예제 #23
0
 def predict(self, inp):
     """Run the chained SPMV layers once per input column.

     The SPMV operations for all layers are queued once, using the first
     column of the transposed input. For every further column the same host
     buffers are refilled, re-sent and the queued operations are executed
     again.

     Parameters
     ----------
     inp:      array
               input matrix

     Return
     ------
     array
               result prediction matrix, cropped to ``self.out_dim``
     """
     inp = self.format_for_fpga(inp, 1, 1)
     samples = np.transpose(inp)
     C_vector = [(samples[:, 0]).astype(np.float32)]
     gemx.sendMat(C_vector[0])
     for i, l in enumerate(self.kmodel.layers):
         C_vector.append(np.zeros((self.sizes[i][0], 1), dtype=np.float32))
         gemx.sendMat(C_vector[i + 1])
         activation = l.get_config()['activation'] == 'relu'
         gemx.addSPMVOp(self.A_list[i], C_vector[i], C_vector[i + 1],
                        self.sizes[i][2], self.xclbin_opts, activation)
     gemx.execute()
     gemx.getMat(C_vector[-1])
     # Copy every result: C_vector[-1] is zeroed and overwritten by the next
     # column, so keeping a plain reference would lose the first column.
     columns = [C_vector[-1].copy()]
     for j in range(1, samples.shape[1]):
         C_vector[0][:] = (samples[:, j]).astype(np.float32)
         gemx.sendMat(C_vector[0])
         C_vector[-1].fill(0)
         for i in range(len(self.kmodel.layers)):
             gemx.sendMat(C_vector[i + 1])
         gemx.execute()
         gemx.getMat(C_vector[-1])
         columns.append(C_vector[-1].copy())
     gemx.clearInstrBuf()
     result = np.transpose(np.concatenate(columns, axis=1))
     return result[:self.out_dim[0], :self.out_dim[1]]
예제 #24
0
 def predict(self, inp):
     """Queue the USPMV operation(s), execute, and return the cropped output.

     When GEMX_uspmvStages is 1, one operation per layer is queued and each
     output feeds the next layer. Otherwise a single operation is queued
     using ``self.A_list[0]``, with an output sized from ``self.sizes[-1]``.

     Parameters
     ----------
     inp:      array
               input matrix

     Return
     ------
     array
               result prediction matrix, cropped to ``self.out_dim``
     """
     stage_size = int(self.xclbin_opts["GEMX_uspmvStages"])
     layer_size = len(self._qw)
     inp = self.format_for_fpga(inp, 1, self.min_m)
     C_list = [inp.astype(np.float32)]
     gemx.sendMat(C_list[0])
     if stage_size == 1:
         for i in range(layer_size):
             C_list.append(
                 np.zeros((inp.shape[0], self.sizes[i][0]),
                          dtype=np.float32))
             gemx.sendMat(C_list[i + 1])
             gemx.addUSPMVOp(self.A_list[i], C_list[i], C_list[i + 1],
                             inp.shape[0])
     else:
         C_end = np.zeros((inp.shape[0], self.sizes[-1][0]),
                          dtype=np.float32)
         gemx.sendMat(C_end)
         # The output buffer has to be in C_list: without this, C_list[-1]
         # below is the input itself and the input would be returned.
         C_list.append(C_end)
         gemx.addUSPMVOp(self.A_list[0], C_list[0], C_list[-1],
                         inp.shape[0])
     gemx.execute()
     gemx.getMat(C_list[-1])
     gemx.clearInstrBuf()
     result = C_list[-1]
     return result[:self.out_dim[0], :self.out_dim[1]]
예제 #25
0
def test_perf_multi_fcn(ins_count, m_size, k_size, n_size, A_range, B_range, post_scale):
    """Time a chain of ``ins_count`` FCN operations and verify the last one.

    Operation 0 multiplies mat_A[0] by a random B matrix; every later
    operation i multiplies mat_A[i] by the output of operation i - 1. The
    chain length now follows ``ins_count`` instead of being fixed at four, so
    it matches the matrices allocated and the operation totals reported.

    Raises
    ------
    ValueError
        if ``ins_count`` is smaller than 1
    """
    if ins_count < 1:
        raise ValueError("ins_count must be at least 1")

    total_operations = 0
    total_parallel_operations = 0
    mat_A = []
    mat_C = []
    mat_bias = []
    for i in range(ins_count):
        parallel_ops = 2 * m_size[i] * n_size[i] * k_size[i]
        total_parallel_operations += parallel_ops
        total_operations += parallel_ops + m_size[i] * n_size[i] * 3
        mat_A.append(np.random.randint(low=-A_range, high=A_range, size=(m_size[i], k_size[i]), dtype=np.int16))
        mat_bias.append(np.zeros((m_size[i], n_size[i]), dtype=np.int32))
        mat_C.append(np.zeros((m_size[i], n_size[i]), dtype=np.int16, order='C'))
    mat_B0 = np.random.randint(low=-B_range, high=B_range, size=(k_size[0], n_size[0]), dtype=np.int16)
    timePointKernel = [time.time()]  # current time
    for i in range(ins_count):
        gemx.sendMat(mat_A[i])
        gemx.sendMat(mat_C[i])
        gemx.sendMat(mat_bias[i])
    gemx.sendMat(mat_B0)

    # Chain the operations: each one consumes the previous output as B.
    rhs = mat_B0
    for i in range(ins_count):
        gemx.addFCNOp(mat_A[i], rhs, mat_C[i], mat_bias[i], post_scale[0], post_scale[1], 1, 0)
        rhs = mat_C[i]
    timePointKernel.append(time.time())  # send to FPGA
    gemx.execute()
    gemx.clearInstrBuf()
    timePointKernel.append(time.time())  # call kernel
    for out in mat_C:
        gemx.getMat(out)
    timePointKernel.append(time.time())  # copy from FPGA
    freq = gemx.getFreq()
    test.test_perf(timePointKernel, total_operations, total_parallel_operations, freq, 0, 0, 0)

    # Verify the last link of the chain against its actual inputs.
    last = ins_count - 1
    last_rhs = mat_C[last - 1] if last > 0 else mat_B0
    test.multiply_and_cmp(mat_C[last], mat_A[last], last_rhs, mat_bias[last], m_size[last], n_size[last], post_scale)
예제 #26
0
def test_multiInstrv1(int_range, m, k, n, add_bias=False):
    """Queue two dependent GEMM operations and verify both outputs.

    The first operation computes C from A and B; the second computes E from
    D and C. The biases are zero unless ``add_bias == True``, in which case
    they are random int32 values in the same range as the operands.
    """
    print("test_multiInstrv1: %d %d %d %d" % (int_range, m, k, n))

    def random_mat(shape, dtype):
        # Uniform integers in [-int_range, int_range).
        return np.random.randint(low=-int_range,
                                 high=int_range,
                                 size=shape,
                                 dtype=dtype)

    A = random_mat((m, k), np.int16)
    B = random_mat((k, n), np.int16)
    C = np.zeros((m, n), dtype=np.int16)
    D = random_mat((m, k), np.int16)
    E = np.zeros((m, n), dtype=np.int16)

    if add_bias == True:
        b0 = random_mat((m, n), np.int32)
        b1 = random_mat((m, n), np.int32)
    else:
        b0 = np.zeros((m, n), dtype=np.int32)
        b1 = np.zeros((m, n), dtype=np.int32)

    for operand in (A, B, b0, C, D, E, b1):
        gemx.sendMat(operand)
    gemx.addGEMMOp(A, B, C, b0, 1, 0)
    gemx.addGEMMOp(D, C, E, b1, 1, 0)
    gemx.execute()
    gemx.clearInstrBuf()
    gemx.getMat(C)
    gemx.getMat(E)

    print("test C")
    test.multiply_and_cmp(C, A, B, b0, m, n, [1, 0])
    print("test E")
    test.multiply_and_cmp(E, D, C, b1, m, n, [1, 0])
예제 #27
0
     int(xclbin_opt["GEMX_gemmKBlocks"]), int(
         xclbin_opt["GEMX_gemmNBlocks"]))
 if args.mtx == 'none':
     num_matrix = len(args.matrix) / 3
     for i in range(num_matrix):
         m = args.matrix[i * 3]
         k = args.matrix[i * 3 + 1]
         n = args.matrix[i * 3 + 2]
         m = int(math.ceil(np.float32(m) / min_m) * min_m)
         k = int(math.ceil(np.float32(k) / min_k) * min_k)
         n = int(math.ceil(np.float32(n) / min_n) * min_n)
         print(m, k, n)
         A = np.zeros((m, k), dtype=np.int16)
         A.fill(1)
         A_buf.append(A)
         gemx.sendMat(A_buf[i])
         B_buf.append(np.zeros((k, n), dtype=np.int16))
         C_buf.append(np.zeros((m, n), dtype=np.int16))
         bias = np.zeros((m, n), dtype=np.int32)
         bias.fill(1)
         bias_buf.append(bias)
 else:
     #For fcn if read from mtx files, matrix sizes still need to be provided
     num_matrix = len(args.mtx)
     if num_matrix != len(args.matrix) / 3:
         raise Exception("please enter sizes for each layer")
     for i in range(num_matrix):
         matA = sio.mmread(args.mtx[i])
         m = args.matrix[i * 3]
         k = args.matrix[i * 3 + 1]
         n = args.matrix[i * 3 + 2]