def __init__(
    self,
    ta=1,
    tpb=256,
    sm_size=48 * 256 * 4,
):
    """Load and specialize the ``get_div_of_address`` CUDA kernel.

    The kernel template is read from ``kernels/GetDivOfAddressV2Kernel.cu``
    and the ``_TA_`` / ``_TPB_`` placeholders are substituted before the
    source is handed to ``cp.RawKernel`` (nvcc backend).

    Args:
        ta: How many clusters each thread is responsible for; substituted
            for ``_TA_``.
        tpb: Value substituted for ``_TPB_`` (presumably threads per
            block -- confirm against the kernel source).
        sm_size: Upper bound, in bytes, that the dynamic shared memory
            request ``ta * tpb * 8`` must not exceed.

    Raises:
        ValueError: If ``ta * tpb * 8`` is larger than ``sm_size``.
    """
    super(GetDivOfAddressV2CUDA, self).__init__()
    self.ta = ta  # how many clusters each thread is responsible for
    self.tpb = tpb
    self.sm_size = sm_size

    # Explicit check instead of `assert`, so the guard survives `python -O`.
    shared_bytes = ta * tpb * 8
    if shared_bytes > sm_size:
        raise ValueError(
            f"ta * tpb * 8 = {shared_bytes} bytes exceeds sm_size = {sm_size}"
        )

    with open(get_absolute_path("kernels", "GetDivOfAddressV2Kernel.cu"),
              "r") as f:
        # The unspecialized template is kept on the instance.
        self.kernel = f.read()

    kernel = self.kernel.replace("_TA_", str(ta)).replace("_TPB_", str(tpb))

    self.fn = cp.RawKernel(
        kernel,
        'get_div_of_address',
        backend='nvcc',
        # options=('--maxrregcount=255',),
    )
    self.fn.max_dynamic_shared_size_bytes = shared_bytes
def __init__(
    self,
    de=16,
    dk=16,
    sm_size=48 * 256 * 4,
):
    """Load and specialize the ``compute_centroids`` CUDA kernel.

    The kernel template is read from
    ``kmeans/kernels/ComputeCentroidsKernel.cu``; the placeholders
    ``_DE_``, ``_DK_``, ``_TPB_`` and ``_NITERS_`` are substituted before
    the source is handed to ``cp.RawKernel`` (default backend).

    Args:
        de: Value substituted for ``_DE_`` in the kernel source.
        dk: Value substituted for ``_DK_`` in the kernel source.
        sm_size: Dynamic shared memory size in bytes requested for the
            kernel; ``dk * (de + 1) * 4`` must fit into it.

    Raises:
        ValueError: If ``dk * (de + 1) * 4`` is larger than ``sm_size``.
    """
    super(ComputeCentroidsCUDA, self).__init__()
    self.de = de
    self.dk = dk

    # Explicit check instead of `assert`, so the guard survives `python -O`.
    required_bytes = dk * (de + 1) * 4
    if required_bytes > sm_size:
        raise ValueError(
            f"dk * (de + 1) * 4 = {required_bytes} bytes exceeds "
            f"sm_size = {sm_size}"
        )

    self.tpb = 256
    self.sm_size = sm_size

    with open(
            get_absolute_path("kmeans", "kernels",
                              "ComputeCentroidsKernel.cu"), "r") as f:
        # The unspecialized template is kept on the instance.
        self.kernel = f.read()

    # _NITERS_ is dk divided by tpb, rounded up.
    n_iters = math.ceil(dk / self.tpb)
    kernel = (self.kernel
              .replace("_DE_", str(de))
              .replace("_DK_", str(dk))
              .replace("_TPB_", str(self.tpb))
              .replace("_NITERS_", str(n_iters)))

    self.fn = cp.RawKernel(
        kernel,
        'compute_centroids',
        # options=('--maxrregcount=255',),
        # backend='nvcc',
    )
    self.fn.max_dynamic_shared_size_bytes = sm_size
def __init__(
    self,
    tpb=256,
    sm_size=48 * 256 * 4,
):
    """Load and specialize the ``get_ioa`` CUDA kernel.

    Reads the template ``kernels/GetIOAKernel.cu``, substitutes the
    ``_TPB_`` placeholder and compiles the result with the nvcc backend.

    Args:
        tpb: Value substituted for ``_TPB_`` in the kernel source.
        sm_size: Stored on the instance as ``self.sm_size``; it is not
            used elsewhere in this constructor.
    """
    super(GetIOACUDA, self).__init__()
    self.tpb = tpb
    self.sm_size = sm_size

    template_path = get_absolute_path("kernels", "GetIOAKernel.cu")
    with open(template_path, "r") as template_file:
        # The unspecialized template is kept on the instance.
        self.kernel = template_file.read()

    specialized_source = self.kernel.replace("_TPB_", str(tpb))

    # options=('--maxrregcount=255',) is left disabled here.
    self.fn = cp.RawKernel(
        code=specialized_source,
        name='get_ioa',
        backend='nvcc',
    )
def __init__(
    self,
    m=8,
    k=256,
    n_cs=4,
    sm_size=48 * 256 * 4,
):
    """Load and specialize the ``compute_product`` CUDA kernel.

    Reads the template ``kernels/ComputeProductKernel.cu``, generates one
    lookup line and one accumulation line per index in ``range(n_cs)``,
    and substitutes the placeholders ``_CODEBLOCK_``, ``_VARNAMES_``,
    ``_M_``, ``_K_``, ``_TPB_`` and ``_NCS_`` in that order.

    Args:
        m: Value substituted for ``_M_``.
        k: Value substituted for ``_K_``.
        n_cs: Number of generated ``Bval``/``d`` entries; also substituted
            for ``_NCS_``.
        sm_size: Dynamic shared memory size in bytes requested for the
            kernel.
    """
    super(ComputeProductCUDA, self).__init__()
    self.m = m
    self.k = k
    self.tpb = 256
    self.n_cs = n_cs
    self.sm_size = sm_size

    template_path = get_absolute_path("kernels", "ComputeProductKernel.cu")
    with open(template_path, "r") as template_file:
        # The unspecialized template is kept on the instance.
        self.kernel = template_file.read()

    # Generated CUDA snippet: all lookups first, then all accumulations.
    lookup_lines = []
    accumulate_lines = []
    for i in range(n_cs):
        lookup_lines.append(
            f" float Bval{i} = Bsh[(i * _NCS_ + {i}) * _K_ + int(Avals.d{i}) ];"
        )
        accumulate_lines.append(f" sum += Bval{i};")
    codeblock = "\n".join(
        ["\n".join(lookup_lines), "\n".join(accumulate_lines)])
    varnames = ", ".join(f"d{i}" for i in range(n_cs))

    # Order matters: the generated code block itself contains _NCS_ and
    # _K_, so it has to be inserted before those placeholders are filled.
    substitutions = (
        ("_CODEBLOCK_", codeblock),
        ("_VARNAMES_", varnames),
        ("_M_", str(m)),
        ("_K_", str(k)),
        ("_TPB_", str(self.tpb)),
        ("_NCS_", str(n_cs)),
    )
    specialized_source = self.kernel
    for placeholder, value in substitutions:
        specialized_source = specialized_source.replace(placeholder, value)

    # options=('--maxrregcount=255',) and backend='nvcc' are left disabled.
    self.fn = cp.RawKernel(
        code=specialized_source,
        name='compute_product',
    )
    self.fn.max_dynamic_shared_size_bytes = sm_size
def __init__(
    self,
    tm=2,
    td=8,
):
    """Load and specialize the ``pq_decode`` CUDA kernel.

    Reads the template ``kernels/PQDecodeKernel.cu`` and substitutes the
    ``_TD_``, ``_TM_`` and ``_TPB_`` placeholders. The dynamic shared
    memory request is ``td * tm * 256 * 4`` bytes.

    Args:
        tm: Value substituted for ``_TM_`` in the kernel source.
        td: Value substituted for ``_TD_`` in the kernel source.
    """
    super(PQDecodeCUDA, self).__init__()
    self.tm = tm
    self.td = td
    self.tpb = 256
    self.sm_size = td * tm * 256 * 4

    template_path = get_absolute_path("kernels", "PQDecodeKernel.cu")
    with open(template_path, "r") as template_file:
        # The unspecialized template is kept on the instance.
        self.kernel = template_file.read()

    specialized_source = self.kernel
    for placeholder, value in (
        ("_TD_", str(td)),
        ("_TM_", str(tm)),
        ("_TPB_", str(self.tpb)),
    ):
        specialized_source = specialized_source.replace(placeholder, value)

    self.fn = cp.RawKernel(code=specialized_source, name='pq_decode')
    self.fn.max_dynamic_shared_size_bytes = self.sm_size
def __init__(self, m=None, n=None, k=None, dim=None, distance="euclidean"):
    """Load and specialize the four ``max_sim_*`` CUDA kernels.

    Reads the template ``kmeans/kernels/MaxSimKernel.cu``, picks the
    per-thread distance routine from ``distance`` and substitutes the
    placeholders ``_M_``, ``_N_``, ``_K_``, ``_DIM_`` and ``_DISTFN_``.
    A falsy ``m``/``n``/``k``/``dim`` leaves the corresponding symbol as
    the plain identifier ``M``/``N``/``K``/``DIM`` in the kernel source.

    Args:
        m: Value substituted for ``_M_``, or falsy to use ``M``.
        n: Value substituted for ``_N_``, or falsy to use ``N``.
        k: Value substituted for ``_K_``, or falsy to use ``K``.
        dim: Value substituted for ``_DIM_``, or falsy to use ``DIM``.
        distance: One of ``"euclidean"``/``"l2"``, ``"manhattan"``/``"l1"``,
            ``"inner"`` or ``"cosine"``. ``"cosine"`` uses the same routine
            as ``"inner"`` and prints a reminder that inputs are not
            normalized.

    Raises:
        ValueError: If ``distance`` is not one of the recognized names.
    """
    super(MaxSimCUDA, self).__init__()
    self.m = m
    self.n = n
    self.k = k
    self.dim = dim
    self.distance = distance

    template_path = get_absolute_path("kmeans", "kernels", "MaxSimKernel.cu")
    with open(template_path, 'r') as template_file:
        self.kernel = template_file.read()

    # Map the distance name to the device function used by the kernel.
    if distance in ("euclidean", "l2"):
        distfn = "thread_nseuclidean"
    elif distance in ("manhattan", "l1"):
        distfn = "thread_nmanhattan"
    elif distance == "inner":
        distfn = "thread_matmul"
    elif distance == "cosine":
        print(
            "warning: input matrices will not be normalized, please normalize them manually for cosine similarity"
        )
        distfn = "thread_matmul"
    else:
        raise ValueError("unrecognized distance type")

    substitutions = (
        ("_M_", str(m) if m else "M"),
        ("_N_", str(n) if n else "N"),
        ("_K_", str(k) if k else "K"),
        ("_DIM_", str(dim) if dim else "DIM"),
        ("_DISTFN_", distfn),
    )
    specialized_source = self.kernel
    for placeholder, value in substitutions:
        specialized_source = specialized_source.replace(placeholder, value)
    # Unlike the raw template read above, self.kernel now holds the
    # specialized source.
    self.kernel = specialized_source

    def build_kernel(entry_point):
        # All four entry points share the same source and compile flags.
        return cp.RawKernel(
            code=self.kernel,
            name=entry_point,
            backend='nvcc',
            options=('--maxrregcount=128', '--use_fast_math'),
        )

    self._fn_tt = build_kernel("max_sim_tt")
    self._fn_nn = build_kernel("max_sim_nn")
    self._fn_tn = build_kernel("max_sim_tn")
    self._fn_nt = build_kernel("max_sim_nt")