Python segmented_sort 예제들, numbapro.cudalib.sorting.segsort.segmented_sort Python 예제들

예제 #1

0

파일 보기

파일: dks.py 프로젝트: MShaffar19/numbapro-spca

    def flush(self, metric_opt, supp_opt):
        """Evaluate every queued candidate and return the best result so far.

        The first column of each array queued in ``self.Vcs`` is packed into
        one flat float32 buffer (one segment per queued array), sorted on the
        GPU together with an index array, and the last ``self.k`` indices of
        each segment are scored as ``||self.V[indices, :]||**2``.

        Parameters
        ----------
        metric_opt
            Best metric known before this call.
        supp_opt
            Support (index array) associated with ``metric_opt``.

        Returns
        -------
        tuple
            ``(metric_opt, supp_opt)``, replaced by a queued candidate only
            when its metric is strictly greater. ``self.Vcs`` is emptied
            unless it was already empty on entry.
        """
        if not self.Vcs:
            # Nothing queued: hand back the incoming optimum untouched.
            return metric_opt, supp_opt

        k = self.k
        V = self.V

        n_rows = V.shape[0]
        n_segs = len(self.Vcs)
        assert n_rows
        assert n_segs

        # Pack the first column of every queued array back-to-back so that
        # each one occupies its own contiguous segment of length n_rows.
        keys = np.zeros(n_rows * n_segs, dtype=np.float32)
        start = 0
        for Vc in self.Vcs:
            keys[start:start + n_rows] = Vc[:, 0]
            start += n_rows

        # Device-side key buffer plus one row of indices per segment.
        d_keys = cuda.to_device(keys)
        d_idx = cuda.device_array((n_segs, n_rows), dtype=np.uint32)

        tile = 32
        grid = (divup(d_idx.shape[0], tile), divup(d_idx.shape[1], tile))
        init_indices[grid, (tile, tile)](d_idx)

        # Segment boundaries are the offsets where segments 1..n_segs-1 begin.
        # A lone segment is given the single boundary 0 instead of an empty
        # array.
        if n_segs != 1:
            boundaries = (np.arange(n_segs - 1, dtype=np.int32) + 1) * n_rows
        else:
            boundaries = np.arange(1, dtype=np.int32)

        segmented_sort(d_keys, d_idx, cuda.to_device(boundaries))

        # Pull back only the trailing k indices of every segment
        # (presumably the k largest keys, if the sort is ascending -- confirm).
        candidates = [d_idx[seg, -k:].copy_to_host() for seg in range(n_segs)]

        # Reduce on the host (original note: "Assume A is huge").
        for support in candidates:
            score = np.linalg.norm(V[support, :]) ** 2
            if score > metric_opt:
                metric_opt, supp_opt = score, support

        # Drop all queued arrays now that they have been scored.
        self.Vcs.clear()
        return metric_opt, supp_opt

예제 #2

0

파일 보기

파일: spca.py 프로젝트: mahjoubihamza/numbapro-spca

def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised search on the GPU for a vector with ``k`` non-zero entries.

    ``ceil((4 / epsilon) ** d)`` random samples are processed in batches of
    at most 6400. For every sample the device kernels build a column of
    ``dA``, a segmented sort orders each column together with its row
    indices, and the norm over the last ``k`` sorted entries is taken as the
    sample's score. The best-scoring sample seen so far defines the result.

    Parameters
    ----------
    Vd : array
        Host array copied to the device; ``Vd.shape[0]`` is the length of
        the returned vector and ``Vd.dtype`` the dtype of the device buffers.
    epsilon : float
        Controls the number of samples drawn (smaller means more samples).
    d : int
        Number of random coefficients drawn per sample.
    k : int
        Number of entries kept per sample.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Vd.shape[0], 1)`` and dtype ``float_dtype`` that is
        zero everywhere except at the ``k`` selected indices of the best
        sample, which hold that sample's values divided by its score.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    maxSize = 6400

    # Running optimum
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage: one column per sample
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples),
                               order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise them on the
        # device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of
        #       samples as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        # NOTE(review): when numSamples == 1 `segments` is an empty array --
        # confirm segmented_sort accepts that.
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Keep only the last k rows of the sorted values and their indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        custr.synchronize()

        # Find the last strict improvement in this batch on the host first.
        # Only that sample can end up in opt_x, so its column is the only one
        # worth copying back from the device.
        best = -1
        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                best = i

        if best >= 0:
            opt_x.fill(0)

            # Only copy what we need
            Ik = dI[:, best].copy_to_host()
            aIk = dA[:, best].copy_to_host().reshape(k, 1)
            opt_x[Ik] = (aIk / opt_v)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x

예제 #3

0

파일 보기

파일: spca.py 프로젝트: ContinuumIO/numbapro-spca

def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised search on the GPU for a vector with ``k`` non-zero entries.

    The number of random samples is ``ceil((4 / epsilon) ** d)``; they are
    processed in batches of at most 6400. Each sample yields one column of
    ``dA``; a segmented sort orders every column along with its row indices,
    and the norm over the last ``k`` sorted entries scores the sample. The
    highest-scoring sample determines the returned vector.

    Parameters
    ----------
    Vd : array
        Host array copied to the device; ``Vd.shape[0]`` fixes the length of
        the result and ``Vd.dtype`` the dtype of the device buffers.
    epsilon : float
        Controls how many samples are drawn (smaller means more samples).
    d : int
        Number of random coefficients drawn per sample.
    k : int
        Number of entries kept per sample.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Vd.shape[0], 1)`` and dtype ``float_dtype``; zero
        except at the ``k`` selected indices of the best sample, which hold
        that sample's values divided by its score.
    """
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon) ** d))
    print(initNumSamples)
    maxSize = 6400

    # Running optimum
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Per-batch device storage: one column per sample
        dA = cuda.device_array(shape=(Vd.shape[0], numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # Generate all random samples up front and normalise them on the
        # device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel can be easily scaled due to the use of num of
        #       samples as the ncta
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection is dominating the time
        # NOTE(review): when numSamples == 1 `segments` is an empty array --
        # confirm segmented_sort accepts that.
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Keep only the last k rows of the sorted values and their indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        custr.synchronize()

        # Scan the scores on the host and remember the last strict
        # improvement. Copying device columns for intermediate improvements
        # would be wasted work because each one overwrites opt_x entirely.
        best = -1
        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                best = i

        if best >= 0:
            opt_x.fill(0)

            # Only copy what we need
            Ik = dI[:, best].copy_to_host()
            aIk = dA[:, best].copy_to_host().reshape(k, 1)
            opt_x[Ik] = (aIk / opt_v)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x