Example #1
    def test_csr2sertilp(self):

        mat = np.array([ [1,0,2,0,3,0], 
                         [4,0,5,0,0,0],
                         [0,0,0,6,7,0],
                         [0,0,0,0,0,8],
                         [21,0,22,0,23,0], 
                         [24,0,25,0,0,0],
                         [0,0,0,26,27,0],
                         [0,0,0,0,0,28]
                       ])
        
        sp_mat = sp.csr_matrix(mat)
        
        row_len_right = np.array([1,1,1,1,1,1,1,1])
        sl_start_right = np.array([0,16,32])
        val_right = np.array([1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 0.0,
                              3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                              21.0, 22.0, 24.0, 25.0, 26.0, 27.0, 28.0, 0.0,
                              23.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        #columns taken directly from the dataset,
        col_vs_right = np.array([1, 3, 1, 3, 4, 5, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0,
                                 1, 3, 1, 3, 4, 5, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0])
        #but in the sparse format columns start from 0, so we have to subtract 1
        col_right = col_vs_right - 1
        col_right[col_right == -1] = 0
                
        val,col,row_len,sl_start=spf.csr2sertilp(sp_mat,
                                            threadsPerRow=2, 
                                            prefetch=2,
                                            sliceSize=4,
                                            minAlign=2*4)
                                                    
        self.assertTrue(np.allclose(row_len,row_len_right), 'sliced ellpack row length arrays are not equal')
        self.assertTrue(np.allclose(sl_start,sl_start_right), 'sliced ellpack slice start arrays are not equal')       
        self.assertTrue(np.allclose(val,val_right), 'sliced ellpack values arrays are not equal')
        self.assertTrue(np.allclose(col,col_right), 'sliced ellpack columns arrays are not equal')
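For context, the assertion messages call this layout "sliced ellpack" (sertilp): rows are grouped into slices of sliceSize rows, and each row's nonzeros are dealt out threadsPerRow values at a time, round-robin across the rows of the slice, which is why val_right begins 1,2, 4,5, 6,7, 8,0 for the four rows of the first slice. A minimal sketch of the plain, non-sliced ELLPACK packing this builds on, written as a hypothetical reference helper rather than the spf API:

    import numpy as np
    import scipy.sparse as sp

    def csr2ellpack_reference(sp_mat):
        #nonzeros per row, straight from the CSR row pointers
        row_len = np.diff(sp_mat.indptr)
        width = row_len.max()
        n_rows = sp_mat.shape[0]
        val = np.zeros((n_rows, width), dtype=np.float32)
        col = np.zeros((n_rows, width), dtype=np.int32)
        for i in range(n_rows):
            start, end = sp_mat.indptr[i], sp_mat.indptr[i + 1]
            val[i, :row_len[i]] = sp_mat.data[start:end]
            col[i, :row_len[i]] = sp_mat.indices[start:end]
        return val, col, row_len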
Example #2
    def test_csr2sertilp_class_smaller_than_slice_size(self):
        '''
        Tests the creation of the sertilp_class representation, in which each
        class is aligned to slice_size: a class with fewer rows than
        slice_size is padded with 'slice_size - n' rows filled with zeros.

        There are 8 rows for three different classes [0,1,2]; in each class
        the number of rows is smaller than sliceSize.
        '''
        threadsPerRow=2
        prefetch=2
        sliceSize=4
        minAlign=2*4

        mat = np.array([ [1,0,2,0,3,0], 
                         [4,0,5,0,0,0],
                         [0,0,0,6,7,0],
                         [0,0,0,0,0,8],
                         [9,0,10,0,11,0], 
                         [12,0,13,0,0,0],
                         [0,0,0,14,15,0],
                         [0,0,0,0,0,16]
                       ])
        y = np.array([0,0,0,1,1,2,2,2])
        
        sp_mat = sp.csr_matrix(mat)
        row_len_right = np.array([1,1,1,1,1,1,1,1])        
        sl_start_right = np.array([0,16,32, 48])
        cls_slice_right = np.array([0,1,2,3])
        
        val_right = np.array([1.0,2.0, 4.0,5.0, 6.0,7.0, 0.0,0.0,
                              3.0,0.0, 0.0,0.0, 0.0,0.0, 0.0,0.0,
                              8.0,0.0, 9.0,10.0, 0.0,0.0, 0.0,0.0,
                              0.0,0.0, 11.0,0.0, 0.0,0.0, 0.0,0.0,
                              12.0,13.0, 14.0,15.0, 16.0,0.0, 0.0,0.0,
                              0.0,0.0, 0.0,0.0, 0.0,0.0, 0.0,0.0,                              
                              ])
        col_right = np.array([0,2,  0,2,  3,4,  0,0,
                                 4,0,  0,0,  0,0,  0,0,
                                 5,0,  0,2,  0,0,  0,0,
                                 0,0,  4,0,  0,0,  0,0,
                                 0,2,  3,4,  5,0,  0,0, 
                                 0,0,  0,0,  0,0,  0,0])                              
        
        val,col,row_len,sl_start,cls_slice=spf.csr2sertilp_class(sp_mat,y,
                                            threadsPerRow=threadsPerRow, 
                                            prefetch=prefetch,
                                            sliceSize=sliceSize,
                                            minAlign=minAlign)
                                                    
        self.assertTrue(np.allclose(row_len,row_len_right), 'sliced ellpack row length arrays are not equal')
        self.assertTrue(np.allclose(sl_start,sl_start_right), 'sliced ellpack slice start arrays are not equal')       
        self.assertTrue(np.allclose(cls_slice,cls_slice_right), 'sliced ellpack class slice start arrays are not equal')       
        self.assertTrue(np.allclose(val,val_right), 'sliced ellpack values arrays are not equal')
        self.assertTrue(np.allclose(col,col_right), 'sliced ellpack columns arrays are not equal')
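The expected cls_slice offsets follow from the labels alone: each class is padded to a whole number of slices, so a class with n rows occupies ceil(n / sliceSize) of them. A sketch of that bookkeeping, inferred from the expected arrays rather than taken from the library source:

    import numpy as np

    y = np.array([0, 0, 0, 1, 1, 2, 2, 2])
    sliceSize = 4
    counts = np.bincount(y)                              #rows per class: [3, 2, 3]
    slices = -(-counts // sliceSize)                     #ceil: slices per padded class
    cls_slice = np.concatenate(([0], np.cumsum(slices)))
    print(cls_slice)                                     #[0 1 2 3] == cls_slice_right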
Example #3
    def test_csr2sertilp_class_grather_than_slice_size(self):
        
        threadsPerRow=2
        prefetch=2
        sliceSize=4
        minAlign=2*4

        mat = np.array([ [1,0,2,0,3,0], 
                         [1,2,0,0,0,0],
                         [1,2,3,4,0,0],  
                         [4,0,5,0,0,0],
                         [0,0,0,6,7,0],
                         [0,0,0,0,0,8],
                         [9,0,10,0,11,0], 
                         [12,0,13,0,0,0],
                         [0,0,0,14,15,0],
                         [0,0,0,0,0,16]
                       ])
        y = np.array([0,0,0,0,0,1,1,2,2,2])
        
        sp_mat = sp.csr_matrix(mat)
        row_len_right = np.array([1,1,1,1,1,1,1,1,1,1])        
        sl_start_right = np.array([0,16,32,48,64])
        cls_slice_right = np.array([0,2,3,4])
        
        val_right = np.array([1.0,2.0, 1.0,2.0, 1.0,2.0, 4.0,5.0, 
                              3.0,0.0, 0.0,0.0, 3.0,4.0, 0.0,0.0,
                              6.0,7.0, 0.0,0.0, 0.0,0.0, 0.0,0.0,
                              0.0,0.0, 0.0,0.0, 0.0,0.0, 0.0,0.0,                              
                              8.0,0.0, 9.0,10.0, 0.0,0.0, 0.0,0.0,
                              0.0,0.0, 11.0,0.0, 0.0,0.0, 0.0,0.0,
                              12.0,13.0, 14.0,15.0, 16.0,0.0, 0.0,0.0,
                              0.0,0.0, 0.0,0.0, 0.0,0.0, 0.0,0.0,                              
                              ])
        col_right = np.array([0,2,  0,1,  0,1,  0,2,
                              4,0,  0,0,  2,3,  0,0,
                              3,4,  0,0,  0,0,  0,0,
                              0,0,  0,0,  0,0,  0,0,        
                              5,0,  0,2,  0,0,  0,0,
                              0,0,  4,0,  0,0,  0,0,
                              0,2,  3,4,  5,0,  0,0, 
                              0,0,  0,0,  0,0,  0,0])                              
        
        val,col,row_len,sl_start, cls_slice=spf.csr2sertilp_class(sp_mat,y,
                                            threadsPerRow=threadsPerRow, 
                                            prefetch=prefetch,
                                            sliceSize=sliceSize,
                                            minAlign=minAlign)
                                                    
        self.assertTrue(np.allclose(row_len,row_len_right), 'sliced ellpack row length arrays are not equal')
        self.assertTrue(np.allclose(sl_start,sl_start_right), 'sliced ellpack slice start arrays are not equal')       
        self.assertTrue(np.allclose(cls_slice,cls_slice_right), 'sliced ellpack class slice start arrays are not equal')       
        self.assertTrue(np.allclose(val,val_right), 'sliced ellpack values arrays are not equal')
        self.assertTrue(np.allclose(col,col_right), 'sliced ellpack columns arrays are not equal')
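Note that row_len is not the raw nonzero count: every expected entry here is 1 even though some rows hold four nonzeros, while the unequal-length test below expects 2 for the six-nonzero rows. That is consistent with row_len counting prefetch rounds, i.e. ceil(nnz / (threadsPerRow * prefetch)). A quick check of that reading (an inference from the expected arrays, not library code):

    import numpy as np

    nnz = np.diff(sp_mat.indptr)                     #nonzeros per row
    row_len = -(-nnz // (threadsPerRow * prefetch))  #ceil division
    print(np.array_equal(row_len, row_len_right))    #True for this matrix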
Example #4
    def init_cuda(self,X,Y, cls_start, max_kernels=1 ):
        
        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels 
        
        self.X =X
        self.Y = Y
        
        self.cls_start=cls_start.astype(np.int32)
        
        #handle to gpu memory for y for each concurrent classifier
        self.g_y=[]
        #handle to gpu memory for results for each concurrent classifier
        self.g_out=[] #gpu kernel out
        self.kernel_out=[] #cpu kernel out
        #blocks per grid for each concurrent classifier    
        self.bpg=[]
        
        #function reference
        self.func=[]
        
        #texture references for each concurrent kernel
        self.tex_ref=[]

        #main vectors 
        #gpu        
        self.g_vecI=[]
        self.g_vecJ=[]
        #cpu
        self.main_vecI=[]
        self.main_vecJ=[]    
        
        #cpu class 
        self.cls_count=[]
        self.cls=[]
        #gpu class
        self.g_cls_count=[]
        self.g_cls=[]
        
        self.sum_cls=[]
        
        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)            
#            self.func.append(0)
#            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
#            self.main_vecI.append(0)
#            self.main_vecJ.append(0)
            self.sum_cls.append(0)
            
            
        self.N,self.Dim = X.shape
        column_size = self.N*4
        cacheMB = self.cache_size*1024*1024  #cache size in bytes (self.cache_size is in MB, e.g. 100)
        
        #how many kernel columns will be stored in the cache
        cache_items = np.floor(cacheMB/column_size).astype(int)
        
        cache_items = min(self.N,cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)        
        
        self.compute_diag()
        
        #cuda initialization
        cuda.init()        
        
        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #read the CUDA .cu file with the module code
        with open(self.module_file, "r") as CudaFile:
            module_code = CudaFile.read()
        
        #compile module
        self.module = SourceModule(module_code,keep=True,no_extern_c=True)
        
        (g_gamma,gsize)=self.module.get_global('GAMMA')       
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) )
        
        #get function references

        Dim = self.Dim
        vecBytes = Dim * 4
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I
            vecI_tex=self.module.get_texref('VecI_TexRef')
            self.g_vecI[f]=cuda.mem_alloc( vecBytes)           
            vecI_tex.set_address(self.g_vecI[f],vecBytes)

            #init texture for vector J
            vecJ_tex=self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f]=cuda.mem_alloc( vecBytes)     
            vecJ_tex.set_address(self.g_vecJ[f],vecBytes)
            
            self.tex_ref.append((vecI_tex,vecJ_tex) )
            
            self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32))
            self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32))
            
            texReflist = list(self.tex_ref[f])
            
            #argument signature for prepare(): P - pointer, i - int
            gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist)
            
        
        #transform X to particular format
        v,c,r=spf.csr2ellpack(self.X,align=self.prefetch)
        #copy format data structure to gpu memory
        
        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)
        
        self.g_cls_start = cuda.to_device(self.cls_start)
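A short sketch of how a later launch might feed the texture-bound buffers prepared above (the indices i and f and the densify step are assumptions; only the allocations and cuda.memcpy_htod come from this function):

    #densify support vector i into the staging buffer of classifier f,
    #then copy it into the device allocation the texture points at
    self.main_vecI[f][:] = self.X[i, :].toarray()
    cuda.memcpy_htod(self.g_vecI[f], self.main_vecI[f])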
Example #5
 print('CPU RBF takes', t1 - t0, 's')
 kij = np.array([ki, kj]).flatten()
 print('Total sum:', kij.sum())
 print(kij[0:1000:skip])
 
 
 import pycuda.driver as cuda
 import pycuda.tools
 import pycuda.autoinit
 from pycuda.compiler import SourceModule
 
 ##----------------------------------------------
 # Ellpack gpu kernel
 
 
 v,c,r=spf.csr2ellpack(X,align=prefetch)
 
 sd=rbf.Diag
 self_dot = rbf.Xsquare
 results = np.zeros(2*num_el,dtype=np.float32)
 
 kernel_file = "ellpackKernel.cu"
 
 with open(kernel_file, "r") as CudaFile:
     data = CudaFile.read()
 
 #copy memory to device
 g_val = cuda.to_device(v)
 g_col = cuda.to_device(c)
 g_r   = cuda.to_device(r)
 g_self = cuda.to_device(self_dot)
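A plausible continuation of this fragment (a sketch: the kernel name below is an assumption, not taken from the original file):

    #compile the .cu source and fetch the ELLPACK kernel by name
    module = SourceModule(data)
    gfun = module.get_function('rbfEllpackILP')  #kernel name assumed
    #device buffer for the kernel results
    g_out = cuda.to_device(results)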
Example #6
    def test_csr2sertilp_class_grather_than_slice_size_unequal(self):
        
        threadsPerRow=2
        prefetch=2
        sliceSize=2
        minAlign=threadsPerRow*sliceSize

        mat = np.array([ [1,0,2,0,3,0], 
                         [1,2,0,0,0,0],
                         [1,2,3,4,5,6],

                         [6,5,4,3,2,1],
                         [0,0,0,6,7,0],
                         [0,0,0,0,0,8],

                         [9,0,10,0,11,0], 
                         [12,0,13,0,0,0],
                         [0,0,0,14,15,0],

                         [1,2,3,4,5,17],
                         [0,0,0,0,0,18],
                         [0,0,0,0,0,19]
                       ])
        y = np.array([0,0,0,1,1,1,2,2,2,3,3,3])

        sp_mat = sp.csr_matrix(mat)
        row_len_right = np.array([1,1,2,2,1,1,1,1,1,2,1,1])        
        sl_start_right = np.array([ 0,  8, 24, 40, 48, 56, 64, 80, 88])
        cls_slice_right = np.array([0,2,4,6,8])
        
        val_right = np.array([  1.,   2.,   1.,   2.,   3.,   0.,   0.,   0.,   1.,   2.,   0.,
                                0.,   3.,   4.,   0.,   0.,   5.,   6.,   0.,   0.,   0.,   0.,
                                0.,   0.,   6.,   5.,   6.,   7.,   4.,   3.,   0.,   0.,   2.,
                                1.,   0.,   0.,   0.,   0.,   0.,   0.,   8.,   0.,   0.,   0.,
                                0.,   0.,   0.,   0.,   9.,  10.,  12.,  13.,  11.,   0.,   0.,
                                0.,  14.,  15.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,
                                18.,   0.,   3.,   4.,   0.,   0.,   5.,  17.,   0.,   0.,   0.,
                                0.,   0.,   0.,  19.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])
                            
                              
        col_right = np.array([0, 2, 0, 1, 4, 0, 0, 0, 0, 1, 0, 0, 2, 3, 0, 0, 4, 5, 0, 0, 0, 0, 0,
                              0, 0, 1, 3, 4, 2, 3, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0,
                              0, 0, 0, 2, 0, 2, 4, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 2,
                              3, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0])
        
        val,col,row_len,sl_start, cls_slice=spf.csr2sertilp_class(sp_mat,y,
                                            threadsPerRow=threadsPerRow, 
                                            prefetch=prefetch,
                                            sliceSize=sliceSize,
                                            minAlign=minAlign)
                                                    
        self.assertTrue(np.allclose(row_len,row_len_right), 'sliced ellpack row length arrays are not equal')
        self.assertTrue(np.allclose(sl_start,sl_start_right), 'sliced ellpack slice start arrays are not equal')       
        self.assertTrue(np.allclose(cls_slice,cls_slice_right), 'sliced ellpack class slice start arrays are not equal')       
        self.assertTrue(np.allclose(val,val_right), 'sliced ellpack values arrays are not equal')
        self.assertTrue(np.allclose(col,col_right), 'sliced ellpack columns arrays are not equal')
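The expected slice starts can likewise be reconstructed: each slice appears to occupy sliceSize * threadsPerRow * prefetch * max(row_len within the slice) value slots. A sketch that reproduces sl_start_right from the arrays above (again an inference from the test data, not library code):

    import numpy as np

    starts = [0]
    for c in np.unique(y):
        rows = row_len_right[y == c]           #row lengths for this class
        for s in range(0, len(rows), sliceSize):
            chunk = rows[s:s + sliceSize]      #one slice of the class
            starts.append(starts[-1] +
                          sliceSize * threadsPerRow * prefetch * chunk.max())
    print(np.array(starts))                    #[ 0  8 24 40 48 56 64 80 88]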