def test_csr2sertilp(self):
    """Check csr2sertilp against a hand-computed 8x6 example.

    The conversion is called with threadsPerRow=2, prefetch=2, sliceSize=4
    and minAlign=8; the returned values, column indices, row lengths and
    slice starts are compared with the expected arrays below.
    """
    dense = np.array([
        [1, 0, 2, 0, 3, 0],
        [4, 0, 5, 0, 0, 0],
        [0, 0, 0, 6, 7, 0],
        [0, 0, 0, 0, 0, 8],
        [21, 0, 22, 0, 23, 0],
        [24, 0, 25, 0, 0, 0],
        [0, 0, 0, 26, 27, 0],
        [0, 0, 0, 0, 0, 28],
    ])
    sparse_input = sp.csr_matrix(dense)

    expected_row_len = np.array([1, 1, 1, 1, 1, 1, 1, 1])
    expected_sl_start = np.array([0, 16, 32])
    expected_val = np.array([
        # first slice
        1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 0.0,
        3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        # second slice
        21.0, 22.0, 24.0, 25.0, 26.0, 27.0, 28.0, 0.0,
        23.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    ])

    # Column numbers as they appear in the data set (1-based, 0 = padding).
    one_based_cols = np.array([
        1, 3, 1, 3, 4, 5, 6, 0,
        5, 0, 0, 0, 0, 0, 0, 0,
        1, 3, 1, 3, 4, 5, 6, 0,
        5, 0, 0, 0, 0, 0, 0, 0,
    ])
    # The sparse format counts columns from 0, so shift by one and keep
    # the padding entries at 0 instead of letting them become -1.
    expected_col = np.maximum(one_based_cols - 1, 0)

    val, col, row_len, sl_start = spf.csr2sertilp(
        sparse_input,
        threadsPerRow=2,
        prefetch=2,
        sliceSize=4,
        minAlign=2 * 4,
    )

    comparisons = (
        (row_len, expected_row_len,
         'sliced ellpack row length arrays are not equal'),
        (sl_start, expected_sl_start,
         'sliced ellpack slice start arrays are not equal'),
        (val, expected_val,
         'sliced ellpack values arrays are not equal'),
        (col, expected_col,
         'sliced ellpack collumns arrays are not equal'),
    )
    for actual, expected, message in comparisons:
        self.assertTrue(np.allclose(actual, expected), message)
def test_csr2sertilp_class_smaller_than_slice_size(self):
    """Check csr2sertilp_class when every class is smaller than a slice.

    In the sertilp_class representation each class is aligned to the slice
    size: a class with fewer rows than sliceSize is padded with
    ``sliceSize - n`` rows of zeros.  The 8 input rows belong to three
    classes (0, 1, 2) and each class has fewer rows than sliceSize.
    """
    threads_per_row = 2
    prefetch_size = 2
    slice_size = 4
    min_align = 2 * 4

    dense = np.array([
        [1, 0, 2, 0, 3, 0],
        [4, 0, 5, 0, 0, 0],
        [0, 0, 0, 6, 7, 0],
        [0, 0, 0, 0, 0, 8],
        [9, 0, 10, 0, 11, 0],
        [12, 0, 13, 0, 0, 0],
        [0, 0, 0, 14, 15, 0],
        [0, 0, 0, 0, 0, 16],
    ])
    labels = np.array([0, 0, 0, 1, 1, 2, 2, 2])
    sparse_input = sp.csr_matrix(dense)

    expected_row_len = np.array([1, 1, 1, 1, 1, 1, 1, 1])
    expected_sl_start = np.array([0, 16, 32, 48])
    expected_cls_slice = np.array([0, 1, 2, 3])
    expected_val = np.array([
        # slice 0 (class 0)
        1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 0.0, 0.0,
        3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        # slice 1 (class 1)
        8.0, 0.0, 9.0, 10.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        # slice 2 (class 2)
        12.0, 13.0, 14.0, 15.0, 16.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    ])
    expected_col = np.array([
        # slice 0 (class 0)
        0, 2, 0, 2, 3, 4, 0, 0,
        4, 0, 0, 0, 0, 0, 0, 0,
        # slice 1 (class 1)
        5, 0, 0, 2, 0, 0, 0, 0,
        0, 0, 4, 0, 0, 0, 0, 0,
        # slice 2 (class 2)
        0, 2, 3, 4, 5, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
    ])

    val, col, row_len, sl_start, cls_slice = spf.csr2sertilp_class(
        sparse_input,
        labels,
        threadsPerRow=threads_per_row,
        prefetch=prefetch_size,
        sliceSize=slice_size,
        minAlign=min_align,
    )

    comparisons = (
        (row_len, expected_row_len,
         'sliced ellpack row length arrays are not equal'),
        (sl_start, expected_sl_start,
         'sliced ellpack slice start arrays are not equal'),
        (cls_slice, expected_cls_slice,
         'sliced ellpack class slice start arrays are not equal'),
        (val, expected_val,
         'sliced ellpack values arrays are not equal'),
        (col, expected_col,
         'sliced ellpack collumns arrays are not equal'),
    )
    for actual, expected, message in comparisons:
        self.assertTrue(np.allclose(actual, expected), message)
def test_csr2sertilp_class_grather_than_slice_size(self):
    """Check csr2sertilp_class when a class has more rows than a slice.

    Ten rows are labelled with three classes; class 0 has five rows while
    sliceSize is 4, and the expected class-slice array gives it two slices
    (the second one mostly padding).
    """
    threads_per_row = 2
    prefetch_size = 2
    slice_size = 4
    min_align = 2 * 4

    dense = np.array([
        [1, 0, 2, 0, 3, 0],
        [1, 2, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 0],
        [4, 0, 5, 0, 0, 0],
        [0, 0, 0, 6, 7, 0],
        [0, 0, 0, 0, 0, 8],
        [9, 0, 10, 0, 11, 0],
        [12, 0, 13, 0, 0, 0],
        [0, 0, 0, 14, 15, 0],
        [0, 0, 0, 0, 0, 16],
    ])
    labels = np.array([0, 0, 0, 0, 0, 1, 1, 2, 2, 2])
    sparse_input = sp.csr_matrix(dense)

    expected_row_len = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    expected_sl_start = np.array([0, 16, 32, 48, 64])
    expected_cls_slice = np.array([0, 2, 3, 4])
    expected_val = np.array([
        # slice 0 (class 0, first four rows)
        1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 4.0, 5.0,
        3.0, 0.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0,
        # slice 1 (class 0, remaining row + padding)
        6.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        # slice 2 (class 1)
        8.0, 0.0, 9.0, 10.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        # slice 3 (class 2)
        12.0, 13.0, 14.0, 15.0, 16.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    ])
    expected_col = np.array([
        # slice 0 (class 0, first four rows)
        0, 2, 0, 1, 0, 1, 0, 2,
        4, 0, 0, 0, 2, 3, 0, 0,
        # slice 1 (class 0, remaining row + padding)
        3, 4, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        # slice 2 (class 1)
        5, 0, 0, 2, 0, 0, 0, 0,
        0, 0, 4, 0, 0, 0, 0, 0,
        # slice 3 (class 2)
        0, 2, 3, 4, 5, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
    ])

    val, col, row_len, sl_start, cls_slice = spf.csr2sertilp_class(
        sparse_input,
        labels,
        threadsPerRow=threads_per_row,
        prefetch=prefetch_size,
        sliceSize=slice_size,
        minAlign=min_align,
    )

    comparisons = (
        (row_len, expected_row_len,
         'sliced ellpack row length arrays are not equal'),
        (sl_start, expected_sl_start,
         'sliced ellpack slice start arrays are not equal'),
        (cls_slice, expected_cls_slice,
         'sliced ellpack class slice start arrays are not equal'),
        (val, expected_val,
         'sliced ellpack values arrays are not equal'),
        (col, expected_col,
         'sliced ellpack collumns arrays are not equal'),
    )
    for actual, expected, message in comparisons:
        self.assertTrue(np.allclose(actual, expected), message)
def init_cuda(self, X, Y, cls_start, max_kernels=1):
    """Set up host bookkeeping and GPU resources for kernel evaluation.

    Stores the inputs on the instance, creates the per-kernel placeholder
    lists, sizes the LRU cache of kernel columns, creates a CUDA context on
    device 0, compiles the module read from ``self.module_file`` and copies
    the data structures produced by ``spf.csr2ellpack`` to the device.

    Reads ``self.cache_size``, ``self.module_file``, ``self.Gamma``,
    ``self.func_name``, ``self.prefetch`` and ``self.Xsquare`` and calls
    ``self.compute_diag()``.

    Parameters:
        X: data matrix; ``X.shape`` must unpack to (N, Dim).
        Y: labels, stored unchanged as ``self.Y``.
        cls_start: array converted to ``np.int32`` and copied to the GPU.
        max_kernels: number of concurrent kernels to prepare.
    """
    self.max_concurrent_kernels = max_kernels
    self.X = X
    self.Y = Y
    self.cls_start = cls_start.astype(np.int32)

    # One entry per concurrent kernel, pre-filled with 0 placeholders:
    #   g_y / g_out            - gpu handles for labels and kernel output
    #   kernel_out             - cpu kernel output
    #   bpg                    - blocks per grid
    #   g_vecI / g_vecJ        - gpu main vectors
    #   cls_count / cls        - cpu class data
    #   g_cls_count / g_cls    - gpu class data
    per_kernel_slots = (
        'g_y', 'g_out', 'kernel_out', 'bpg',
        'g_vecI', 'g_vecJ',
        'cls_count', 'cls',
        'g_cls_count', 'g_cls',
        'sum_cls',
    )
    for slot_name in per_kernel_slots:
        setattr(self, slot_name, [0] * max_kernels)

    # These start empty and are appended to while each kernel is prepared.
    self.func = []       # function references
    self.tex_ref = []    # texture reference pairs
    self.main_vecI = []  # cpu main vectors
    self.main_vecJ = []

    self.N, self.Dim = X.shape

    # How many kernel columns fit in the cache: N entries of 4 bytes each
    # per column, with cache_size scaled by 1024*1024 (given in MB,
    # presumably).  Never cache more columns than there are rows.
    bytes_per_column = self.N * 4
    cache_bytes = self.cache_size * 1024 * 1024
    columns_that_fit = np.floor(cache_bytes / bytes_per_column).astype(int)
    self.kernel_cache = pylru.lrucache(min(self.N, columns_that_fit))

    self.compute_diag()

    # CUDA initialization.
    cuda.init()
    self.dev = cuda.Device(0)
    self.ctx = self.dev.make_context()

    # Read the .cu source and compile it.
    with open(self.module_file, "r") as source_file:
        module_source = source_file.read()
    self.module = SourceModule(module_source, keep=True, no_extern_c=True)

    # Copy gamma into the module-level GAMMA symbol.
    gamma_ptr, _ = self.module.get_global('GAMMA')
    cuda.memcpy_htod(gamma_ptr, np.float32(self.Gamma))

    dim = self.Dim
    vec_bytes = dim * 4
    texture_targets = (
        ('VecI_TexRef', self.g_vecI),
        ('VecJ_TexRef', self.g_vecJ),
    )
    for k in range(self.max_concurrent_kernels):
        kernel = self.module.get_function(self.func_name)
        self.func.append(kernel)

        # Allocate the device vector behind each texture (I first, then J)
        # and bind the texture reference to it.
        textures = []
        for tex_name, device_vectors in texture_targets:
            tex = self.module.get_texref(tex_name)
            device_vectors[k] = cuda.mem_alloc(vec_bytes)
            tex.set_address(device_vectors[k], vec_bytes)
            textures.append(tex)
        self.tex_ref.append(tuple(textures))

        self.main_vecI.append(np.zeros((1, dim), dtype=np.float32))
        self.main_vecJ.append(np.zeros((1, dim), dtype=np.float32))

        # Argument signature: P - pointer, i - int.
        kernel.prepare("PPPPPPiiiiiiPPP", texrefs=textures)

    # Transform X into the ellpack format and copy it to GPU memory.
    values, columns, row_lengths = spf.csr2ellpack(self.X, align=self.prefetch)
    self.g_val = cuda.to_device(values)
    self.g_col = cuda.to_device(columns)
    self.g_len = cuda.to_device(row_lengths)
    self.g_sdot = cuda.to_device(self.Xsquare)
    self.g_cls_start = cuda.to_device(self.cls_start)
# Report the CPU timing and a summary of the CPU results.
# NOTE: t0, t1, ki, kj and skip are defined outside this excerpt.
print 'CPU RBF takes',t1-t0, 's'
# Stack both result arrays and flatten them into one vector.
kij= np.array( [ki,kj]).flatten()
print 'Total sum:',kij.sum()
# Print every skip-th value among the first 1000 entries.
print kij[0:1000:skip]

import pycuda.driver as cuda
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule

##----------------------------------------------
# Ellpack gpu kernel
# Convert X with spf.csr2ellpack; the three results are copied to the
# device below (presumably values, column indices and row lengths).
v,c,r=spf.csr2ellpack(X,align=prefetch)
sd=rbf.Diag
self_dot = rbf.Xsquare
# Host buffer of 2*num_el float32 zeros (presumably for the kernel results).
results = np.zeros(2*num_el,dtype=np.float32)
kernel_file = "ellpackKernel.cu"
# Read the CUDA source text from disk.
with open (kernel_file,"r") as CudaFile:
    data = CudaFile.read();
# Copy memory to device.
g_val = cuda.to_device(v)
g_col = cuda.to_device(c)
g_r = cuda.to_device(r)
g_self = cuda.to_device(self_dot)
def test_csr2sertilp_class_grather_than_slice_size_unequal(self):
    """Check csr2sertilp_class with classes larger than a slice and rows of
    unequal length.

    Twelve rows are split into four classes of three rows each while
    sliceSize is 2; some rows hold six non-zeros and others only one, so
    the expected slices have different sizes.
    """
    threads_per_row = 2
    prefetch_size = 2
    slice_size = 2
    min_align = threads_per_row * slice_size

    dense = np.array([
        [1, 0, 2, 0, 3, 0],
        [1, 2, 0, 0, 0, 0],
        [1, 2, 3, 4, 5, 6],
        [6, 5, 4, 3, 2, 1],
        [0, 0, 0, 6, 7, 0],
        [0, 0, 0, 0, 0, 8],
        [9, 0, 10, 0, 11, 0],
        [12, 0, 13, 0, 0, 0],
        [0, 0, 0, 14, 15, 0],
        [1, 2, 3, 4, 5, 17],
        [0, 0, 0, 0, 0, 18],
        [0, 0, 0, 0, 0, 19],
    ])
    labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
    sparse_input = sp.csr_matrix(dense)

    expected_row_len = np.array([1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1])
    expected_sl_start = np.array([0, 8, 24, 40, 48, 56, 64, 80, 88])
    expected_cls_slice = np.array([0, 2, 4, 6, 8])
    expected_val = np.array([
        1.0, 2.0, 1.0, 2.0, 3.0, 0.0, 0.0, 0.0,
        1.0, 2.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0,
        5.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        6.0, 5.0, 6.0, 7.0, 4.0, 3.0, 0.0, 0.0,
        2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        9.0, 10.0, 12.0, 13.0, 11.0, 0.0, 0.0, 0.0,
        14.0, 15.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 2.0, 18.0, 0.0, 3.0, 4.0, 0.0, 0.0,
        5.0, 17.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
    ])
    expected_col = np.array([
        0, 2, 0, 1, 4, 0, 0, 0,
        0, 1, 0, 0, 2, 3, 0, 0,
        4, 5, 0, 0, 0, 0, 0, 0,
        0, 1, 3, 4, 2, 3, 0, 0,
        4, 5, 0, 0, 0, 0, 0, 0,
        5, 0, 0, 0, 0, 0, 0, 0,
        0, 2, 0, 2, 4, 0, 0, 0,
        3, 4, 0, 0, 0, 0, 0, 0,
        0, 1, 5, 0, 2, 3, 0, 0,
        4, 5, 0, 0, 0, 0, 0, 0,
        5, 0, 0, 0, 0, 0, 0, 0,
    ])

    val, col, row_len, sl_start, cls_slice = spf.csr2sertilp_class(
        sparse_input,
        labels,
        threadsPerRow=threads_per_row,
        prefetch=prefetch_size,
        sliceSize=slice_size,
        minAlign=min_align,
    )

    comparisons = (
        (row_len, expected_row_len,
         'sliced ellpack row length arrays are not equal'),
        (sl_start, expected_sl_start,
         'sliced ellpack slice start arrays are not equal'),
        (cls_slice, expected_cls_slice,
         'sliced ellpack class slice start arrays are not equal'),
        (val, expected_val,
         'sliced ellpack values arrays are not equal'),
        (col, expected_col,
         'sliced ellpack collumns arrays are not equal'),
    )
    for actual, expected, message in comparisons:
        self.assertTrue(np.allclose(actual, expected), message)
def init_cuda(self, X, Y, cls_start, max_kernels=1):
    """Initialise CUDA and all per-kernel state for this object.

    Steps, in order: remember the inputs, create per-kernel placeholder
    lists, build the LRU kernel-column cache, call ``self.compute_diag()``,
    create a context on CUDA device 0, compile ``self.module_file``, set the
    ``GAMMA`` global, prepare ``max_kernels`` kernel functions with their
    two texture references, and copy the ``spf.csr2ellpack`` form of ``X``
    together with ``self.Xsquare`` and ``cls_start`` to the device.

    Parameters:
        X: data matrix whose ``shape`` unpacks to (N, Dim).
        Y: labels, kept as ``self.Y``.
        cls_start: array stored as ``np.int32`` and copied to the device.
        max_kernels: how many concurrent kernels to set up.
    """
    def zero_slots():
        # A fresh list per attribute so the slots never alias each other.
        return [0] * max_kernels

    self.max_concurrent_kernels = max_kernels
    self.X = X
    self.Y = Y
    self.cls_start = cls_start.astype(np.int32)

    # gpu handles for labels / kernel output, and the cpu kernel output
    self.g_y = zero_slots()
    self.g_out = zero_slots()
    self.kernel_out = zero_slots()
    # blocks per grid for each concurrent classifier
    self.bpg = zero_slots()
    # function and texture references are appended in the loop below
    self.func = []
    self.tex_ref = []
    # main vectors: gpu side pre-sized, cpu side appended below
    self.g_vecI = zero_slots()
    self.g_vecJ = zero_slots()
    self.main_vecI = []
    self.main_vecJ = []
    # class bookkeeping, cpu then gpu
    self.cls_count = zero_slots()
    self.cls = zero_slots()
    self.g_cls_count = zero_slots()
    self.g_cls = zero_slots()
    self.sum_cls = zero_slots()

    n_rows, n_dims = X.shape
    self.N = n_rows
    self.Dim = n_dims

    # Number of kernel columns held in the cache: each column has N
    # entries of 4 bytes; cache_size is multiplied by 1024*1024 (so it is
    # presumably given in MB).  Capped at N columns.
    column_bytes = self.N * 4
    cache_bytes = self.cache_size * 1024 * 1024
    n_cached = np.floor(cache_bytes / column_bytes).astype(int)
    n_cached = min(self.N, n_cached)
    self.kernel_cache = pylru.lrucache(n_cached)

    self.compute_diag()

    # cuda initialization
    cuda.init()
    self.dev = cuda.Device(0)
    self.ctx = self.dev.make_context()

    # read the .cu file with the module code and compile it
    with open(self.module_file, "r") as cu_file:
        cu_source = cu_file.read()
    self.module = SourceModule(cu_source, keep=True, no_extern_c=True)

    # get_global returns (device pointer, size); only the pointer is used
    gamma_dev = self.module.get_global('GAMMA')[0]
    cuda.memcpy_htod(gamma_dev, np.float32(self.Gamma))

    n_bytes = self.Dim * 4
    for idx in range(self.max_concurrent_kernels):
        fn = self.module.get_function(self.func_name)
        self.func.append(fn)

        # texture for vector I
        tex_i = self.module.get_texref('VecI_TexRef')
        self.g_vecI[idx] = cuda.mem_alloc(n_bytes)
        tex_i.set_address(self.g_vecI[idx], n_bytes)

        # texture for vector J
        tex_j = self.module.get_texref('VecJ_TexRef')
        self.g_vecJ[idx] = cuda.mem_alloc(n_bytes)
        tex_j.set_address(self.g_vecJ[idx], n_bytes)

        self.tex_ref.append((tex_i, tex_j))

        self.main_vecI.append(np.zeros((1, self.Dim), dtype=np.float32))
        self.main_vecJ.append(np.zeros((1, self.Dim), dtype=np.float32))

        # function definition: P - pointer, i - int
        fn.prepare("PPPPPPiiiiiiPPP", texrefs=[tex_i, tex_j])

    # transform X to the ellpack format and copy it to gpu memory
    ell_val, ell_col, ell_len = spf.csr2ellpack(self.X, align=self.prefetch)
    self.g_val = cuda.to_device(ell_val)
    self.g_col = cuda.to_device(ell_col)
    self.g_len = cuda.to_device(ell_len)
    self.g_sdot = cuda.to_device(self.Xsquare)
    self.g_cls_start = cuda.to_device(self.cls_start)