def flush(self, metric_opt, supp_opt):
    """Evaluate every buffered candidate in ``self.Vcs`` and empty the buffer.

    The first column of each buffered ``Vc`` is packed into one flat
    float32 buffer (one segment per candidate), the segments are sorted on
    the GPU together with an index array, and the last ``self.k`` indices
    of each sorted segment are taken as that candidate's support.  A
    support is scored by the squared norm of the rows of ``self.V`` it
    selects.

    Parameters
    ----------
    metric_opt
        Best score found so far.
    supp_opt
        Support (index array) that achieved ``metric_opt``.

    Returns
    -------
    tuple
        ``(metric_opt, supp_opt)``, replaced by the best buffered candidate
        if it scores strictly higher; returned unchanged when the buffer
        is empty.
    """
    pending = self.Vcs
    if not pending:
        # Nothing buffered, nothing to evaluate.
        return metric_opt, supp_opt

    top_count = self.k
    basis = self.V
    rows = basis.shape[0]
    seg_count = len(pending)
    assert rows
    assert seg_count

    # One contiguous run of `rows` keys per buffered candidate.
    keys = np.zeros(rows * seg_count, dtype=np.float32)
    for seg, Vc in enumerate(pending):
        start = seg * rows
        keys[start:start + rows] = Vc[:, 0]

    # Device buffers: the keys, plus an index array with one row per segment.
    d_keys = cuda.to_device(keys)
    d_index = cuda.device_array((seg_count, rows), dtype=np.uint32)
    tile = 32
    grid = (divup(d_index.shape[0], tile), divup(d_index.shape[1], tile))
    # presumably fills each row of d_index with its own positions -- confirm
    # against the init_indices kernel
    init_indices[grid, (tile, tile)](d_index)

    # Segment boundaries are the start offsets of segments 1..seg_count-1;
    # a lone segment is described by the single offset 0.
    segments = (
        np.arange(1, dtype=np.int32)
        if seg_count == 1
        else (np.arange(seg_count - 1, dtype=np.int32) + 1) * rows
    )
    segmented_sort(d_keys, d_index, cuda.to_device(segments))

    # Bring back only the trailing `top_count` indices of every segment.
    candidates = [
        d_index[row, -top_count:].copy_to_host() for row in range(seg_count)
    ]

    # Reduce: keep the support with the largest squared norm.
    for support in candidates:
        score = np.linalg.norm(basis[support, :]) ** 2
        if score > metric_opt:
            metric_opt, supp_opt = score, support

    # The buffer has been consumed.
    self.Vcs.clear()
    return metric_opt, supp_opt
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised sparse-PCA search executed on the GPU.

    Draws ``ceil((4 / epsilon) ** d)`` random ``d``-dimensional directions
    in batches of at most 6400.  For every direction ``c`` the device
    computes ``a = Vd.dot(c)``, sorts ``a`` within its own column, keeps
    the last ``k`` sorted entries and records their norm.  The sample with
    the largest norm determines the returned vector.

    NOTE(review): another ``spca_full`` is defined later in this file; that
    later definition is the one bound to the name after import.

    Parameters
    ----------
    Vd : array
        Host array uploaded to the device; ``Vd.shape[0]`` is the length
        of the result.  Presumably of shape ``(p, d)`` -- confirm against
        the ``batch_matmul`` kernel.
    epsilon : float
        Accuracy parameter controlling the number of samples.
    d : int
        Dimension of each random direction.
    k : int
        Number of entries kept per sample (size of the support).

    Returns
    -------
    numpy.ndarray
        ``(p, 1)`` array of ``float_dtype``: zero everywhere except at the
        ``k`` selected indices of the best sample, where it holds the
        selected values divided by their norm.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon) ** d))
    print(initNumSamples)
    maxSize = 6400   # samples processed per batch
    blksz = 32       # thread-block edge used by the init_indices launch

    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)
    remaining = initNumSamples
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage, one column per sample.
        dA = cuda.device_array(shape=(p, numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(p, numSamples), dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise on the device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: one block per sample, so this scales with the sample count.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection dominates the running time.
        if numSamples == 1:
            # Same single-segment convention as flush(); the general
            # formula below would produce an empty boundary array.
            segments = np.arange(1, dtype=np.int32)
        else:
            segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * p
        grid = (divup(dI.shape[0], blksz), divup(dI.shape[1], blksz))
        init_indices[grid, (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Only the last k rows of the sorted values/indices are needed.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        # The copy above is issued on the stream; wait before reading it.
        custr.synchronize()

        # `range` instead of `xrange`, which does not exist on Python 3.
        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)
                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = aIk / val

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised sparse-PCA search executed on the GPU.

    Draws ``ceil((4 / epsilon) ** d)`` random ``d``-dimensional directions
    in batches of at most 6400.  For every direction ``c`` the device
    computes ``a = Vd.dot(c)``, sorts ``a`` within its own column, keeps
    the last ``k`` sorted entries and records their norm.  The sample with
    the largest norm determines the returned vector.

    NOTE(review): a function with the same name and signature is defined
    earlier in this file; this definition replaces it at import time.

    Parameters
    ----------
    Vd : array
        Host array uploaded to the device; ``Vd.shape[0]`` is the length
        of the result.  Presumably of shape ``(p, d)`` -- confirm against
        the ``batch_matmul`` kernel.
    epsilon : float
        Accuracy parameter controlling the number of samples.
    d : int
        Dimension of each random direction.
    k : int
        Number of entries kept per sample (size of the support).

    Returns
    -------
    numpy.ndarray
        ``(p, 1)`` array of ``float_dtype``: zero everywhere except at the
        ``k`` selected indices of the best sample, where it holds the
        selected values divided by their norm.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon) ** d))
    print(initNumSamples)
    maxSize = 6400   # samples processed per batch
    blksz = 32       # thread-block edge used by the init_indices launch

    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)
    remaining = initNumSamples
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage, one column per sample.
        dA = cuda.device_array(shape=(p, numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(p, numSamples), dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise on the device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: one block per sample, so this scales with the sample count.
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection dominates the running time.
        if numSamples == 1:
            # Same single-segment convention as flush(); the general
            # formula below would produce an empty boundary array.
            segments = np.arange(1, dtype=np.int32)
        else:
            segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * p
        grid = (divup(dI.shape[0], blksz), divup(dI.shape[1], blksz))
        init_indices[grid, (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Only the last k rows of the sorted values/indices are needed.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        # The copy above is issued on the stream; wait before reading it.
        custr.synchronize()

        # `range` instead of `xrange`, which does not exist on Python 3.
        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)
                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = aIk / val

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x